]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
execute: split out mounting of credentials fs
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <poll.h>
6 #include <sys/eventfd.h>
7 #include <sys/ioctl.h>
8 #include <sys/mman.h>
9 #include <sys/mount.h>
10 #include <sys/personality.h>
11 #include <sys/prctl.h>
12 #include <sys/shm.h>
13 #include <sys/types.h>
14 #include <sys/un.h>
15 #include <unistd.h>
16 #include <utmpx.h>
17
18 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
19
20 #if HAVE_PAM
21 #include <security/pam_appl.h>
22 #endif
23
24 #if HAVE_SELINUX
25 #include <selinux/selinux.h>
26 #endif
27
28 #if HAVE_SECCOMP
29 #include <seccomp.h>
30 #endif
31
32 #if HAVE_APPARMOR
33 #include <sys/apparmor.h>
34 #endif
35
36 #include "sd-messages.h"
37
38 #include "acl-util.h"
39 #include "af-list.h"
40 #include "alloc-util.h"
41 #if HAVE_APPARMOR
42 #include "apparmor-util.h"
43 #endif
44 #include "argv-util.h"
45 #include "async.h"
46 #include "barrier.h"
47 #include "bpf-lsm.h"
48 #include "btrfs-util.h"
49 #include "cap-list.h"
50 #include "capability-util.h"
51 #include "chattr-util.h"
52 #include "cgroup-setup.h"
53 #include "chase.h"
54 #include "chown-recursive.h"
55 #include "constants.h"
56 #include "cpu-set-util.h"
57 #include "creds-util.h"
58 #include "data-fd-util.h"
59 #include "env-file.h"
60 #include "env-util.h"
61 #include "errno-list.h"
62 #include "escape.h"
63 #include "execute.h"
64 #include "exit-status.h"
65 #include "fd-util.h"
66 #include "fileio.h"
67 #include "format-util.h"
68 #include "glob-util.h"
69 #include "hexdecoct.h"
70 #include "io-util.h"
71 #include "ioprio-util.h"
72 #include "label-util.h"
73 #include "lock-util.h"
74 #include "log.h"
75 #include "macro.h"
76 #include "manager.h"
77 #include "manager-dump.h"
78 #include "memory-util.h"
79 #include "missing_fs.h"
80 #include "missing_ioprio.h"
81 #include "missing_prctl.h"
82 #include "mkdir-label.h"
83 #include "mount-util.h"
84 #include "mountpoint-util.h"
85 #include "namespace.h"
86 #include "parse-util.h"
87 #include "path-util.h"
88 #include "proc-cmdline.h"
89 #include "process-util.h"
90 #include "psi-util.h"
91 #include "random-util.h"
92 #include "recurse-dir.h"
93 #include "rlimit-util.h"
94 #include "rm-rf.h"
95 #if HAVE_SECCOMP
96 #include "seccomp-util.h"
97 #endif
98 #include "securebits-util.h"
99 #include "selinux-util.h"
100 #include "signal-util.h"
101 #include "smack-util.h"
102 #include "socket-util.h"
103 #include "sort-util.h"
104 #include "special.h"
105 #include "stat-util.h"
106 #include "string-table.h"
107 #include "string-util.h"
108 #include "strv.h"
109 #include "syslog-util.h"
110 #include "terminal-util.h"
111 #include "tmpfile-util.h"
112 #include "umask-util.h"
113 #include "unit-serialize.h"
114 #include "user-util.h"
115 #include "utmp-wtmp.h"
116
117 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
118 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
119
120 #define SNDBUF_SIZE (8*1024*1024)
121
/* Moves the passed file descriptors so that fds[i] ends up as fd number i+3. The array is updated in
 * place as descriptors get duplicated and the old ones closed. Returns 0 on success, negative errno if
 * duplicating a descriptor fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int want = idx + 3, dup_fd;

                        /* Nothing to do if this one already sits at its target number */
                        if (fds[idx] == want)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, want);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* F_DUPFD hands out the lowest free number >= want. If that was not the
                         * number we asked for, the target is still occupied; remember the first
                         * such index and do another pass starting from it. */
                        if (dup_fd != want && retry_at < 0)
                                retry_at = idx;
                }

                first = retry_at;
        } while (first >= 0);

        return 0;
}
161
/* Adjusts the file descriptor flags of the fds that are passed on to the child: the first n_socket_fds
 * entries get O_NONBLOCK set or cleared according to 'nonblock', and FD_CLOEXEC is cleared on every
 * entry. Returns 0 on success or the first negative error encountered. */
static int flags_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        if (n_fds == 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < n_fds; idx++) {
                int k;

                /* O_NONBLOCK is only touched for the socket activation fds at the front of the array */
                if (idx < n_socket_fds) {
                        k = fd_nonblock(fds[idx], nonblock);
                        if (k < 0)
                                return k;
                }

                /* These fds are meant to be inherited by the child, hence FD_CLOEXEC is always dropped */
                k = fd_cloexec(fds[idx], false);
                if (k < 0)
                        return k;
        }

        return 0;
}
197
198 static const char *exec_context_tty_path(const ExecContext *context) {
199 assert(context);
200
201 if (context->stdio_as_fds)
202 return NULL;
203
204 if (context->tty_path)
205 return context->tty_path;
206
207 return "/dev/console";
208 }
209
210 static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
211 _cleanup_free_ char *rowskey = NULL, *rowsvalue = NULL, *colskey = NULL, *colsvalue = NULL;
212 unsigned rows, cols;
213 const char *tty;
214 int r;
215
216 assert(context);
217 assert(ret_rows);
218 assert(ret_cols);
219
220 rows = context->tty_rows;
221 cols = context->tty_cols;
222
223 tty = exec_context_tty_path(context);
224 if (!tty || (rows != UINT_MAX && cols != UINT_MAX)) {
225 *ret_rows = rows;
226 *ret_cols = cols;
227 return 0;
228 }
229
230 tty = skip_dev_prefix(tty);
231 if (!in_charset(tty, ALPHANUMERICAL)) {
232 log_debug("%s contains non-alphanumeric characters, ignoring", tty);
233 *ret_rows = rows;
234 *ret_cols = cols;
235 return 0;
236 }
237
238 rowskey = strjoin("systemd.tty.rows.", tty);
239 if (!rowskey)
240 return -ENOMEM;
241
242 colskey = strjoin("systemd.tty.columns.", tty);
243 if (!colskey)
244 return -ENOMEM;
245
246 r = proc_cmdline_get_key_many(/* flags = */ 0,
247 rowskey, &rowsvalue,
248 colskey, &colsvalue);
249 if (r < 0)
250 log_debug_errno(r, "Failed to read TTY size of %s from kernel cmdline, ignoring: %m", tty);
251
252 if (rows == UINT_MAX && rowsvalue) {
253 r = safe_atou(rowsvalue, &rows);
254 if (r < 0)
255 log_debug_errno(r, "Failed to parse %s=%s, ignoring: %m", rowskey, rowsvalue);
256 }
257
258 if (cols == UINT_MAX && colsvalue) {
259 r = safe_atou(colsvalue, &cols);
260 if (r < 0)
261 log_debug_errno(r, "Failed to parse %s=%s, ignoring: %m", colskey, colsvalue);
262 }
263
264 *ret_rows = rows;
265 *ret_cols = cols;
266
267 return 0;
268 }
269
270 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
271 const char *path;
272
273 assert(context);
274
275 path = exec_context_tty_path(context);
276
277 if (context->tty_vhangup) {
278 if (p && p->stdin_fd >= 0)
279 (void) terminal_vhangup_fd(p->stdin_fd);
280 else if (path)
281 (void) terminal_vhangup(path);
282 }
283
284 if (context->tty_reset) {
285 if (p && p->stdin_fd >= 0)
286 (void) reset_terminal_fd(p->stdin_fd, true);
287 else if (path)
288 (void) reset_terminal(path);
289 }
290
291 if (p && p->stdin_fd >= 0) {
292 unsigned rows = context->tty_rows, cols = context->tty_cols;
293
294 (void) exec_context_tty_size(context, &rows, &cols);
295 (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
296 }
297
298 if (context->tty_vt_disallocate && path)
299 (void) vt_disallocate(path);
300 }
301
302 static bool is_terminal_input(ExecInput i) {
303 return IN_SET(i,
304 EXEC_INPUT_TTY,
305 EXEC_INPUT_TTY_FORCE,
306 EXEC_INPUT_TTY_FAIL);
307 }
308
309 static bool is_terminal_output(ExecOutput o) {
310 return IN_SET(o,
311 EXEC_OUTPUT_TTY,
312 EXEC_OUTPUT_KMSG_AND_CONSOLE,
313 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
314 }
315
316 static bool is_kmsg_output(ExecOutput o) {
317 return IN_SET(o,
318 EXEC_OUTPUT_KMSG,
319 EXEC_OUTPUT_KMSG_AND_CONSOLE);
320 }
321
322 static bool exec_context_needs_term(const ExecContext *c) {
323 assert(c);
324
325 /* Return true if the execution context suggests we should set $TERM to something useful. */
326
327 if (is_terminal_input(c->std_input))
328 return true;
329
330 if (is_terminal_output(c->std_output))
331 return true;
332
333 if (is_terminal_output(c->std_error))
334 return true;
335
336 return !!c->tty_path;
337 }
338
339 static int open_null_as(int flags, int nfd) {
340 int fd;
341
342 assert(nfd >= 0);
343
344 fd = open("/dev/null", flags|O_NOCTTY);
345 if (fd < 0)
346 return -errno;
347
348 return move_fd(fd, nfd, false);
349 }
350
351 static int connect_journal_socket(
352 int fd,
353 const char *log_namespace,
354 uid_t uid,
355 gid_t gid) {
356
357 uid_t olduid = UID_INVALID;
358 gid_t oldgid = GID_INVALID;
359 const char *j;
360 int r;
361
362 j = log_namespace ?
363 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
364 "/run/systemd/journal/stdout";
365
366 if (gid_is_valid(gid)) {
367 oldgid = getgid();
368
369 if (setegid(gid) < 0)
370 return -errno;
371 }
372
373 if (uid_is_valid(uid)) {
374 olduid = getuid();
375
376 if (seteuid(uid) < 0) {
377 r = -errno;
378 goto restore_gid;
379 }
380 }
381
382 r = connect_unix_path(fd, AT_FDCWD, j);
383
384 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
385 an LSM interferes. */
386
387 if (uid_is_valid(uid))
388 (void) seteuid(olduid);
389
390 restore_gid:
391 if (gid_is_valid(gid))
392 (void) setegid(oldgid);
393
394 return r;
395 }
396
397 static int connect_logger_as(
398 const Unit *unit,
399 const ExecContext *context,
400 const ExecParameters *params,
401 ExecOutput output,
402 const char *ident,
403 int nfd,
404 uid_t uid,
405 gid_t gid) {
406
407 _cleanup_close_ int fd = -EBADF;
408 int r;
409
410 assert(context);
411 assert(params);
412 assert(output < _EXEC_OUTPUT_MAX);
413 assert(ident);
414 assert(nfd >= 0);
415
416 fd = socket(AF_UNIX, SOCK_STREAM, 0);
417 if (fd < 0)
418 return -errno;
419
420 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
421 if (r < 0)
422 return r;
423
424 if (shutdown(fd, SHUT_RD) < 0)
425 return -errno;
426
427 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
428
429 if (dprintf(fd,
430 "%s\n"
431 "%s\n"
432 "%i\n"
433 "%i\n"
434 "%i\n"
435 "%i\n"
436 "%i\n",
437 context->syslog_identifier ?: ident,
438 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
439 context->syslog_priority,
440 !!context->syslog_level_prefix,
441 false,
442 is_kmsg_output(output),
443 is_terminal_output(output)) < 0)
444 return -errno;
445
446 return move_fd(TAKE_FD(fd), nfd, false);
447 }
448
449 static int open_terminal_as(const char *path, int flags, int nfd) {
450 int fd;
451
452 assert(path);
453 assert(nfd >= 0);
454
455 fd = open_terminal(path, flags | O_NOCTTY);
456 if (fd < 0)
457 return fd;
458
459 return move_fd(fd, nfd, false);
460 }
461
462 static int acquire_path(const char *path, int flags, mode_t mode) {
463 _cleanup_close_ int fd = -EBADF;
464 int r;
465
466 assert(path);
467
468 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
469 flags |= O_CREAT;
470
471 fd = open(path, flags|O_NOCTTY, mode);
472 if (fd >= 0)
473 return TAKE_FD(fd);
474
475 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
476 return -errno;
477
478 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
479
480 fd = socket(AF_UNIX, SOCK_STREAM, 0);
481 if (fd < 0)
482 return -errno;
483
484 r = connect_unix_path(fd, AT_FDCWD, path);
485 if (IN_SET(r, -ENOTSOCK, -EINVAL))
486 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
487 * wasn't an AF_UNIX socket after all */
488 return -ENXIO;
489 if (r < 0)
490 return r;
491
492 if ((flags & O_ACCMODE) == O_RDONLY)
493 r = shutdown(fd, SHUT_WR);
494 else if ((flags & O_ACCMODE) == O_WRONLY)
495 r = shutdown(fd, SHUT_RD);
496 else
497 r = 0;
498 if (r < 0)
499 return -errno;
500
501 return TAKE_FD(fd);
502 }
503
504 static int fixup_input(
505 const ExecContext *context,
506 int socket_fd,
507 bool apply_tty_stdin) {
508
509 ExecInput std_input;
510
511 assert(context);
512
513 std_input = context->std_input;
514
515 if (is_terminal_input(std_input) && !apply_tty_stdin)
516 return EXEC_INPUT_NULL;
517
518 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
519 return EXEC_INPUT_NULL;
520
521 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
522 return EXEC_INPUT_NULL;
523
524 return std_input;
525 }
526
527 static int fixup_output(ExecOutput output, int socket_fd) {
528
529 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
530 return EXEC_OUTPUT_INHERIT;
531
532 return output;
533 }
534
535 static int setup_input(
536 const ExecContext *context,
537 const ExecParameters *params,
538 int socket_fd,
539 const int named_iofds[static 3]) {
540
541 ExecInput i;
542 int r;
543
544 assert(context);
545 assert(params);
546 assert(named_iofds);
547
548 if (params->stdin_fd >= 0) {
549 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
550 return -errno;
551
552 /* Try to make this the controlling tty, if it is a tty, and reset it */
553 if (isatty(STDIN_FILENO)) {
554 unsigned rows = context->tty_rows, cols = context->tty_cols;
555
556 (void) exec_context_tty_size(context, &rows, &cols);
557 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
558 (void) reset_terminal_fd(STDIN_FILENO, true);
559 (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
560 }
561
562 return STDIN_FILENO;
563 }
564
565 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
566
567 switch (i) {
568
569 case EXEC_INPUT_NULL:
570 return open_null_as(O_RDONLY, STDIN_FILENO);
571
572 case EXEC_INPUT_TTY:
573 case EXEC_INPUT_TTY_FORCE:
574 case EXEC_INPUT_TTY_FAIL: {
575 unsigned rows, cols;
576 int fd;
577
578 fd = acquire_terminal(exec_context_tty_path(context),
579 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
580 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
581 ACQUIRE_TERMINAL_WAIT,
582 USEC_INFINITY);
583 if (fd < 0)
584 return fd;
585
586 r = exec_context_tty_size(context, &rows, &cols);
587 if (r < 0)
588 return r;
589
590 r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
591 if (r < 0)
592 return r;
593
594 return move_fd(fd, STDIN_FILENO, false);
595 }
596
597 case EXEC_INPUT_SOCKET:
598 assert(socket_fd >= 0);
599
600 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
601
602 case EXEC_INPUT_NAMED_FD:
603 assert(named_iofds[STDIN_FILENO] >= 0);
604
605 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
606 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
607
608 case EXEC_INPUT_DATA: {
609 int fd;
610
611 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
612 if (fd < 0)
613 return fd;
614
615 return move_fd(fd, STDIN_FILENO, false);
616 }
617
618 case EXEC_INPUT_FILE: {
619 bool rw;
620 int fd;
621
622 assert(context->stdio_file[STDIN_FILENO]);
623
624 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
625 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
626
627 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
628 if (fd < 0)
629 return fd;
630
631 return move_fd(fd, STDIN_FILENO, false);
632 }
633
634 default:
635 assert_not_reached();
636 }
637 }
638
639 static bool can_inherit_stderr_from_stdout(
640 const ExecContext *context,
641 ExecOutput o,
642 ExecOutput e) {
643
644 assert(context);
645
646 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
647 * stderr fd */
648
649 if (e == EXEC_OUTPUT_INHERIT)
650 return true;
651 if (e != o)
652 return false;
653
654 if (e == EXEC_OUTPUT_NAMED_FD)
655 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
656
657 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
658 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
659
660 return true;
661 }
662
663 static int setup_output(
664 const Unit *unit,
665 const ExecContext *context,
666 const ExecParameters *params,
667 int fileno,
668 int socket_fd,
669 const int named_iofds[static 3],
670 const char *ident,
671 uid_t uid,
672 gid_t gid,
673 dev_t *journal_stream_dev,
674 ino_t *journal_stream_ino) {
675
676 ExecOutput o;
677 ExecInput i;
678 int r;
679
680 assert(unit);
681 assert(context);
682 assert(params);
683 assert(ident);
684 assert(journal_stream_dev);
685 assert(journal_stream_ino);
686
687 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
688
689 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
690 return -errno;
691
692 return STDOUT_FILENO;
693 }
694
695 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
696 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
697 return -errno;
698
699 return STDERR_FILENO;
700 }
701
702 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
703 o = fixup_output(context->std_output, socket_fd);
704
705 if (fileno == STDERR_FILENO) {
706 ExecOutput e;
707 e = fixup_output(context->std_error, socket_fd);
708
709 /* This expects the input and output are already set up */
710
711 /* Don't change the stderr file descriptor if we inherit all
712 * the way and are not on a tty */
713 if (e == EXEC_OUTPUT_INHERIT &&
714 o == EXEC_OUTPUT_INHERIT &&
715 i == EXEC_INPUT_NULL &&
716 !is_terminal_input(context->std_input) &&
717 getppid() != 1)
718 return fileno;
719
720 /* Duplicate from stdout if possible */
721 if (can_inherit_stderr_from_stdout(context, o, e))
722 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
723
724 o = e;
725
726 } else if (o == EXEC_OUTPUT_INHERIT) {
727 /* If input got downgraded, inherit the original value */
728 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
729 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
730
731 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
732 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
733 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
734
735 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
736 if (getppid() != 1)
737 return fileno;
738
739 /* We need to open /dev/null here anew, to get the right access mode. */
740 return open_null_as(O_WRONLY, fileno);
741 }
742
743 switch (o) {
744
745 case EXEC_OUTPUT_NULL:
746 return open_null_as(O_WRONLY, fileno);
747
748 case EXEC_OUTPUT_TTY:
749 if (is_terminal_input(i))
750 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
751
752 /* We don't reset the terminal if this is just about output */
753 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
754
755 case EXEC_OUTPUT_KMSG:
756 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
757 case EXEC_OUTPUT_JOURNAL:
758 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
759 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
760 if (r < 0) {
761 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
762 fileno == STDOUT_FILENO ? "stdout" : "stderr");
763 r = open_null_as(O_WRONLY, fileno);
764 } else {
765 struct stat st;
766
767 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
768 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
769 * services to detect whether they are connected to the journal or not.
770 *
771 * If both stdout and stderr are connected to a stream then let's make sure to store the data
772 * about STDERR as that's usually the best way to do logging. */
773
774 if (fstat(fileno, &st) >= 0 &&
775 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
776 *journal_stream_dev = st.st_dev;
777 *journal_stream_ino = st.st_ino;
778 }
779 }
780 return r;
781
782 case EXEC_OUTPUT_SOCKET:
783 assert(socket_fd >= 0);
784
785 return RET_NERRNO(dup2(socket_fd, fileno));
786
787 case EXEC_OUTPUT_NAMED_FD:
788 assert(named_iofds[fileno] >= 0);
789
790 (void) fd_nonblock(named_iofds[fileno], false);
791 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
792
793 case EXEC_OUTPUT_FILE:
794 case EXEC_OUTPUT_FILE_APPEND:
795 case EXEC_OUTPUT_FILE_TRUNCATE: {
796 bool rw;
797 int fd, flags;
798
799 assert(context->stdio_file[fileno]);
800
801 rw = context->std_input == EXEC_INPUT_FILE &&
802 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
803
804 if (rw)
805 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
806
807 flags = O_WRONLY;
808 if (o == EXEC_OUTPUT_FILE_APPEND)
809 flags |= O_APPEND;
810 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
811 flags |= O_TRUNC;
812
813 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
814 if (fd < 0)
815 return fd;
816
817 return move_fd(fd, fileno, 0);
818 }
819
820 default:
821 assert_not_reached();
822 }
823 }
824
825 static int chown_terminal(int fd, uid_t uid) {
826 int r;
827
828 assert(fd >= 0);
829
830 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
831 if (isatty(fd) < 1) {
832 if (IN_SET(errno, EINVAL, ENOTTY))
833 return 0; /* not a tty */
834
835 return -errno;
836 }
837
838 /* This might fail. What matters are the results. */
839 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
840 if (r < 0)
841 return r;
842
843 return 1;
844 }
845
846 static int setup_confirm_stdio(
847 const ExecContext *context,
848 const char *vc,
849 int *ret_saved_stdin,
850 int *ret_saved_stdout) {
851
852 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
853 unsigned rows, cols;
854 int r;
855
856 assert(ret_saved_stdin);
857 assert(ret_saved_stdout);
858
859 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
860 if (saved_stdin < 0)
861 return -errno;
862
863 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
864 if (saved_stdout < 0)
865 return -errno;
866
867 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
868 if (fd < 0)
869 return fd;
870
871 r = chown_terminal(fd, getuid());
872 if (r < 0)
873 return r;
874
875 r = reset_terminal_fd(fd, true);
876 if (r < 0)
877 return r;
878
879 r = exec_context_tty_size(context, &rows, &cols);
880 if (r < 0)
881 return r;
882
883 r = terminal_set_size_fd(fd, vc, rows, cols);
884 if (r < 0)
885 return r;
886
887 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
888 TAKE_FD(fd);
889 if (r < 0)
890 return r;
891
892 *ret_saved_stdin = TAKE_FD(saved_stdin);
893 *ret_saved_stdout = TAKE_FD(saved_stdout);
894 return 0;
895 }
896
897 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
898 assert(err < 0);
899
900 if (err == -ETIMEDOUT)
901 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
902 else {
903 errno = -err;
904 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
905 }
906 }
907
908 static void write_confirm_error(int err, const char *vc, const Unit *u) {
909 _cleanup_close_ int fd = -EBADF;
910
911 assert(vc);
912
913 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
914 if (fd < 0)
915 return;
916
917 write_confirm_error_fd(err, fd, u);
918 }
919
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved stdin/stdout fds back in place (if
 * they are valid) and closes the saved copies. Returns 0 on success, or the negative errno of the last
 * dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
941
/* Possible answers of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* 'f': don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* 's': don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* 'y': execute the command */
};
947
948 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
949 int saved_stdout = -1, saved_stdin = -1, r;
950 _cleanup_free_ char *e = NULL;
951 char c;
952
953 /* For any internal errors, assume a positive response. */
954 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
955 if (r < 0) {
956 write_confirm_error(r, vc, u);
957 return CONFIRM_EXECUTE;
958 }
959
960 /* confirm_spawn might have been disabled while we were sleeping. */
961 if (manager_is_confirm_spawn_disabled(u->manager)) {
962 r = 1;
963 goto restore_stdio;
964 }
965
966 e = ellipsize(cmdline, 60, 100);
967 if (!e) {
968 log_oom();
969 r = CONFIRM_EXECUTE;
970 goto restore_stdio;
971 }
972
973 for (;;) {
974 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
975 if (r < 0) {
976 write_confirm_error_fd(r, STDOUT_FILENO, u);
977 r = CONFIRM_EXECUTE;
978 goto restore_stdio;
979 }
980
981 switch (c) {
982 case 'c':
983 printf("Resuming normal execution.\n");
984 manager_disable_confirm_spawn();
985 r = 1;
986 break;
987 case 'D':
988 unit_dump(u, stdout, " ");
989 continue; /* ask again */
990 case 'f':
991 printf("Failing execution.\n");
992 r = CONFIRM_PRETEND_FAILURE;
993 break;
994 case 'h':
995 printf(" c - continue, proceed without asking anymore\n"
996 " D - dump, show the state of the unit\n"
997 " f - fail, don't execute the command and pretend it failed\n"
998 " h - help\n"
999 " i - info, show a short summary of the unit\n"
1000 " j - jobs, show jobs that are in progress\n"
1001 " s - skip, don't execute the command and pretend it succeeded\n"
1002 " y - yes, execute the command\n");
1003 continue; /* ask again */
1004 case 'i':
1005 printf(" Description: %s\n"
1006 " Unit: %s\n"
1007 " Command: %s\n",
1008 u->id, u->description, cmdline);
1009 continue; /* ask again */
1010 case 'j':
1011 manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, " ");
1012 continue; /* ask again */
1013 case 'n':
1014 /* 'n' was removed in favor of 'f'. */
1015 printf("Didn't understand 'n', did you mean 'f'?\n");
1016 continue; /* ask again */
1017 case 's':
1018 printf("Skipping execution.\n");
1019 r = CONFIRM_PRETEND_SUCCESS;
1020 break;
1021 case 'y':
1022 r = CONFIRM_EXECUTE;
1023 break;
1024 default:
1025 assert_not_reached();
1026 }
1027 break;
1028 }
1029
1030 restore_stdio:
1031 restore_confirm_stdio(&saved_stdin, &saved_stdout);
1032 return r;
1033 }
1034
1035 static int get_fixed_user(const ExecContext *c, const char **user,
1036 uid_t *uid, gid_t *gid,
1037 const char **home, const char **shell) {
1038 int r;
1039 const char *name;
1040
1041 assert(c);
1042
1043 if (!c->user)
1044 return 0;
1045
1046 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1047 * (i.e. are "/" or "/bin/nologin"). */
1048
1049 name = c->user;
1050 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
1051 if (r < 0)
1052 return r;
1053
1054 *user = name;
1055 return 0;
1056 }
1057
1058 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
1059 int r;
1060 const char *name;
1061
1062 assert(c);
1063
1064 if (!c->group)
1065 return 0;
1066
1067 name = c->group;
1068 r = get_group_creds(&name, gid, 0);
1069 if (r < 0)
1070 return r;
1071
1072 *group = name;
1073 return 0;
1074 }
1075
1076 static int get_supplementary_groups(const ExecContext *c, const char *user,
1077 const char *group, gid_t gid,
1078 gid_t **supplementary_gids, int *ngids) {
1079 int r, k = 0;
1080 int ngroups_max;
1081 bool keep_groups = false;
1082 gid_t *groups = NULL;
1083 _cleanup_free_ gid_t *l_gids = NULL;
1084
1085 assert(c);
1086
1087 /*
1088 * If user is given, then lookup GID and supplementary groups list.
1089 * We avoid NSS lookups for gid=0. Also we have to initialize groups
1090 * here and as early as possible so we keep the list of supplementary
1091 * groups of the caller.
1092 */
1093 if (user && gid_is_valid(gid) && gid != 0) {
1094 /* First step, initialize groups from /etc/groups */
1095 if (initgroups(user, gid) < 0)
1096 return -errno;
1097
1098 keep_groups = true;
1099 }
1100
1101 if (strv_isempty(c->supplementary_groups))
1102 return 0;
1103
1104 /*
1105 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1106 * be positive, otherwise fail.
1107 */
1108 errno = 0;
1109 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1110 if (ngroups_max <= 0)
1111 return errno_or_else(EOPNOTSUPP);
1112
1113 l_gids = new(gid_t, ngroups_max);
1114 if (!l_gids)
1115 return -ENOMEM;
1116
1117 if (keep_groups) {
1118 /*
1119 * Lookup the list of groups that the user belongs to, we
1120 * avoid NSS lookups here too for gid=0.
1121 */
1122 k = ngroups_max;
1123 if (getgrouplist(user, gid, l_gids, &k) < 0)
1124 return -EINVAL;
1125 } else
1126 k = 0;
1127
1128 STRV_FOREACH(i, c->supplementary_groups) {
1129 const char *g;
1130
1131 if (k >= ngroups_max)
1132 return -E2BIG;
1133
1134 g = *i;
1135 r = get_group_creds(&g, l_gids+k, 0);
1136 if (r < 0)
1137 return r;
1138
1139 k++;
1140 }
1141
1142 /*
1143 * Sets ngids to zero to drop all supplementary groups, happens
1144 * when we are under root and SupplementaryGroups= is empty.
1145 */
1146 if (k == 0) {
1147 *ngids = 0;
1148 return 0;
1149 }
1150
1151 /* Otherwise get the final list of supplementary groups */
1152 groups = memdup(l_gids, sizeof(gid_t) * k);
1153 if (!groups)
1154 return -ENOMEM;
1155
1156 *supplementary_gids = groups;
1157 *ngids = k;
1158
1159 groups = NULL;
1160
1161 return 0;
1162 }
1163
/* Applies the supplementary groups (only if ngids is positive) and then, if 'gid' is valid, sets the
 * real, effective and saved GID to it. Returns 0 on success, a negative error otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int k;

                k = maybe_setgroups(ngids, supplementary_gids);
                if (k < 0)
                        return k;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1182
/* Updates the process' securebits: the bits in 'mask' are cleared and the bits in 'bits' are set.
 * Returns 0 if the securebits already have the requested value, 1 if they were changed, and negative
 * errno if reading or writing them fails. */
static int set_securebits(unsigned bits, unsigned mask) {
        unsigned wanted;
        int have;

        have = prctl(PR_GET_SECUREBITS);
        if (have < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        wanted = ((unsigned) have & ~mask) | bits;
        if (wanted == (unsigned) have)
                return 0; /* nothing to change */

        return prctl(PR_SET_SECUREBITS, wanted) < 0 ? -errno : 1;
}
1201
1202 static int enforce_user(
1203 const ExecContext *context,
1204 uid_t uid,
1205 uint64_t capability_ambient_set) {
1206 assert(context);
1207 int r;
1208
1209 if (!uid_is_valid(uid))
1210 return 0;
1211
1212 /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1213 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1214 * case. */
1215
1216 if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1217
1218 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1219 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1220 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1221 if (r < 0)
1222 return r;
1223 }
1224
1225 /* Second step: actually set the uids */
1226 if (setresuid(uid, uid, uid) < 0)
1227 return -errno;
1228
1229 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1230 * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1231 * outside of this call. */
1232 return 0;
1233 }
1234
#if HAVE_PAM

/* PAM conversation callback that refuses every conversation: none of the arguments are looked at and
 * PAM_CONV_ERR is returned unconditionally. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        return PAM_CONV_ERR; /* We don't support conversations */
}

#endif
1249
/* Opens a PAM session for the service 'name' and user 'user', and forks off a helper process
 * ("(sd-pam)") that stays around and tears the session down again once the calling process died.
 *
 * name    PAM service name passed to pam_start()
 * user    user name passed to pam_start()
 * uid/gid credentials the helper process drops to
 * tty     TTY path to report to PAM, or NULL to derive it from stdin if that is a TTY
 * env     environment block; its entries are exported to PAM, and on success it is replaced by the
 *         environment list PAM returns
 * fds     file descriptors (n_fds entries) that are closed in the helper process
 *
 * Returns the result of strv_free_and_replace() on success, and a negative errno-style value on
 * failure (-EPERM for PAM errors, since those do not map to errno). If built without PAM support
 * this does nothing and returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is not fatal, the session is opened regardless */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* Note that sigwait() reports failure by returning a positive error
                                 * number. It never returns a negative value and does not set errno,
                                 * hence check the return value itself. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1466
1467 static void rename_process_from_path(const char *path) {
1468 _cleanup_free_ char *buf = NULL;
1469 const char *p;
1470
1471 assert(path);
1472
1473 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1474 * /bin/ps */
1475
1476 if (path_extract_filename(path, &buf) < 0) {
1477 rename_process("(...)");
1478 return;
1479 }
1480
1481 size_t l = strlen(buf);
1482 if (l > 8) {
1483 /* The end of the process name is usually more interesting, since the first bit might just be
1484 * "systemd-" */
1485 p = buf + l - 8;
1486 l = 8;
1487 } else
1488 p = buf;
1489
1490 char process_name[11];
1491 process_name[0] = '(';
1492 memcpy(process_name+1, p, l);
1493 process_name[1+l] = ')';
1494 process_name[1+l+1] = 0;
1495
1496 rename_process(process_name);
1497 }
1498
1499 static bool context_has_address_families(const ExecContext *c) {
1500 assert(c);
1501
1502 return c->address_families_allow_list ||
1503 !set_isempty(c->address_families);
1504 }
1505
1506 static bool context_has_syscall_filters(const ExecContext *c) {
1507 assert(c);
1508
1509 return c->syscall_allow_list ||
1510 !hashmap_isempty(c->syscall_filter);
1511 }
1512
1513 static bool context_has_syscall_logs(const ExecContext *c) {
1514 assert(c);
1515
1516 return c->syscall_log_allow_list ||
1517 !hashmap_isempty(c->syscall_log);
1518 }
1519
1520 static bool context_has_no_new_privileges(const ExecContext *c) {
1521 assert(c);
1522
1523 if (c->no_new_privileges)
1524 return true;
1525
1526 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1527 return false;
1528
1529 /* We need NNP if we have any form of seccomp and are unprivileged */
1530 return c->lock_personality ||
1531 c->memory_deny_write_execute ||
1532 c->private_devices ||
1533 c->protect_clock ||
1534 c->protect_hostname ||
1535 c->protect_kernel_tunables ||
1536 c->protect_kernel_modules ||
1537 c->protect_kernel_logs ||
1538 context_has_address_families(c) ||
1539 exec_context_restrict_namespaces_set(c) ||
1540 c->restrict_realtime ||
1541 c->restrict_suid_sgid ||
1542 !set_isempty(c->syscall_archs) ||
1543 context_has_syscall_filters(c) ||
1544 context_has_syscall_logs(c);
1545 }
1546
1547 bool exec_context_has_credentials(const ExecContext *context) {
1548
1549 assert(context);
1550
1551 return !hashmap_isempty(context->set_credentials) ||
1552 !hashmap_isempty(context->load_credentials) ||
1553 !set_isempty(context->import_credentials);
1554 }
1555
1556 #if HAVE_SECCOMP
1557
1558 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1559
1560 if (is_seccomp_available())
1561 return false;
1562
1563 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1564 return true;
1565 }
1566
1567 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1568 uint32_t negative_action, default_action, action;
1569 int r;
1570
1571 assert(u);
1572 assert(c);
1573
1574 if (!context_has_syscall_filters(c))
1575 return 0;
1576
1577 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1578 return 0;
1579
1580 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1581
1582 if (c->syscall_allow_list) {
1583 default_action = negative_action;
1584 action = SCMP_ACT_ALLOW;
1585 } else {
1586 default_action = SCMP_ACT_ALLOW;
1587 action = negative_action;
1588 }
1589
1590 if (needs_ambient_hack) {
1591 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1592 if (r < 0)
1593 return r;
1594 }
1595
1596 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1597 }
1598
1599 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1600 #ifdef SCMP_ACT_LOG
1601 uint32_t default_action, action;
1602 #endif
1603
1604 assert(u);
1605 assert(c);
1606
1607 if (!context_has_syscall_logs(c))
1608 return 0;
1609
1610 #ifdef SCMP_ACT_LOG
1611 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1612 return 0;
1613
1614 if (c->syscall_log_allow_list) {
1615 /* Log nothing but the ones listed */
1616 default_action = SCMP_ACT_ALLOW;
1617 action = SCMP_ACT_LOG;
1618 } else {
1619 /* Log everything but the ones listed */
1620 default_action = SCMP_ACT_LOG;
1621 action = SCMP_ACT_ALLOW;
1622 }
1623
1624 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1625 #else
1626 /* old libseccomp */
1627 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1628 return 0;
1629 #endif
1630 }
1631
1632 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1633 assert(u);
1634 assert(c);
1635
1636 if (set_isempty(c->syscall_archs))
1637 return 0;
1638
1639 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1640 return 0;
1641
1642 return seccomp_restrict_archs(c->syscall_archs);
1643 }
1644
1645 static int apply_address_families(const Unit* u, const ExecContext *c) {
1646 assert(u);
1647 assert(c);
1648
1649 if (!context_has_address_families(c))
1650 return 0;
1651
1652 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1653 return 0;
1654
1655 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1656 }
1657
1658 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1659 int r;
1660
1661 assert(u);
1662 assert(c);
1663
1664 if (!c->memory_deny_write_execute)
1665 return 0;
1666
1667 /* use prctl() if kernel supports it (6.3) */
1668 r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1669 if (r == 0) {
1670 log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1671 return 0;
1672 }
1673 if (r < 0 && errno != EINVAL)
1674 return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1675 /* else use seccomp */
1676 log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1677
1678 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1679 return 0;
1680
1681 return seccomp_memory_deny_write_execute();
1682 }
1683
1684 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1685 assert(u);
1686 assert(c);
1687
1688 if (!c->restrict_realtime)
1689 return 0;
1690
1691 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1692 return 0;
1693
1694 return seccomp_restrict_realtime();
1695 }
1696
1697 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1698 assert(u);
1699 assert(c);
1700
1701 if (!c->restrict_suid_sgid)
1702 return 0;
1703
1704 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1705 return 0;
1706
1707 return seccomp_restrict_suid_sgid();
1708 }
1709
1710 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1711 assert(u);
1712 assert(c);
1713
1714 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1715 * let's protect even those systems where this is left on in the kernel. */
1716
1717 if (!c->protect_kernel_tunables)
1718 return 0;
1719
1720 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1721 return 0;
1722
1723 return seccomp_protect_sysctl();
1724 }
1725
1726 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1727 assert(u);
1728 assert(c);
1729
1730 /* Turn off module syscalls on ProtectKernelModules=yes */
1731
1732 if (!c->protect_kernel_modules)
1733 return 0;
1734
1735 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1736 return 0;
1737
1738 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1739 }
1740
1741 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1742 assert(u);
1743 assert(c);
1744
1745 if (!c->protect_kernel_logs)
1746 return 0;
1747
1748 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1749 return 0;
1750
1751 return seccomp_protect_syslog();
1752 }
1753
1754 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1755 assert(u);
1756 assert(c);
1757
1758 if (!c->protect_clock)
1759 return 0;
1760
1761 if (skip_seccomp_unavailable(u, "ProtectClock="))
1762 return 0;
1763
1764 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1765 }
1766
1767 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1768 assert(u);
1769 assert(c);
1770
1771 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1772
1773 if (!c->private_devices)
1774 return 0;
1775
1776 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1777 return 0;
1778
1779 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1780 }
1781
1782 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1783 assert(u);
1784 assert(c);
1785
1786 if (!exec_context_restrict_namespaces_set(c))
1787 return 0;
1788
1789 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1790 return 0;
1791
1792 return seccomp_restrict_namespaces(c->restrict_namespaces);
1793 }
1794
1795 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1796 unsigned long personality;
1797 int r;
1798
1799 assert(u);
1800 assert(c);
1801
1802 if (!c->lock_personality)
1803 return 0;
1804
1805 if (skip_seccomp_unavailable(u, "LockPersonality="))
1806 return 0;
1807
1808 personality = c->personality;
1809
1810 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1811 if (personality == PERSONALITY_INVALID) {
1812
1813 r = opinionated_personality(&personality);
1814 if (r < 0)
1815 return r;
1816 }
1817
1818 return seccomp_lock_personality(personality);
1819 }
1820
1821 #endif
1822
1823 #if HAVE_LIBBPF
1824 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1825 assert(u);
1826 assert(c);
1827
1828 if (!exec_context_restrict_filesystems_set(c))
1829 return 0;
1830
1831 if (!u->manager->restrict_fs) {
1832 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1833 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1834 return 0;
1835 }
1836
1837 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1838 }
1839 #endif
1840
1841 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1842 assert(u);
1843 assert(c);
1844
1845 if (!c->protect_hostname)
1846 return 0;
1847
1848 if (ns_type_supported(NAMESPACE_UTS)) {
1849 if (unshare(CLONE_NEWUTS) < 0) {
1850 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1851 *ret_exit_status = EXIT_NAMESPACE;
1852 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1853 }
1854
1855 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1856 }
1857 } else
1858 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1859
1860 #if HAVE_SECCOMP
1861 int r;
1862
1863 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1864 return 0;
1865
1866 r = seccomp_protect_hostname();
1867 if (r < 0) {
1868 *ret_exit_status = EXIT_SECCOMP;
1869 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1870 }
1871 #endif
1872
1873 return 0;
1874 }
1875
1876 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1877 assert(idle_pipe);
1878
1879 idle_pipe[1] = safe_close(idle_pipe[1]);
1880 idle_pipe[2] = safe_close(idle_pipe[2]);
1881
1882 if (idle_pipe[0] >= 0) {
1883 int r;
1884
1885 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1886
1887 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1888 ssize_t n;
1889
1890 /* Signal systemd that we are bored and want to continue. */
1891 n = write(idle_pipe[3], "x", 1);
1892 if (n > 0)
1893 /* Wait for systemd to react to the signal above. */
1894 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1895 }
1896
1897 idle_pipe[0] = safe_close(idle_pipe[0]);
1898
1899 }
1900
1901 idle_pipe[3] = safe_close(idle_pipe[3]);
1902 }
1903
1904 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1905
1906 static int build_environment(
1907 const Unit *u,
1908 const ExecContext *c,
1909 const ExecParameters *p,
1910 const CGroupContext *cgroup_context,
1911 size_t n_fds,
1912 char **fdnames,
1913 const char *home,
1914 const char *username,
1915 const char *shell,
1916 dev_t journal_stream_dev,
1917 ino_t journal_stream_ino,
1918 const char *memory_pressure_path,
1919 char ***ret) {
1920
1921 _cleanup_strv_free_ char **our_env = NULL;
1922 size_t n_env = 0;
1923 char *x;
1924 int r;
1925
1926 assert(u);
1927 assert(c);
1928 assert(p);
1929 assert(ret);
1930
1931 #define N_ENV_VARS 19
1932 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1933 if (!our_env)
1934 return -ENOMEM;
1935
1936 if (n_fds > 0) {
1937 _cleanup_free_ char *joined = NULL;
1938
1939 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1940 return -ENOMEM;
1941 our_env[n_env++] = x;
1942
1943 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1944 return -ENOMEM;
1945 our_env[n_env++] = x;
1946
1947 joined = strv_join(fdnames, ":");
1948 if (!joined)
1949 return -ENOMEM;
1950
1951 x = strjoin("LISTEN_FDNAMES=", joined);
1952 if (!x)
1953 return -ENOMEM;
1954 our_env[n_env++] = x;
1955 }
1956
1957 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1958 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1959 return -ENOMEM;
1960 our_env[n_env++] = x;
1961
1962 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1963 return -ENOMEM;
1964 our_env[n_env++] = x;
1965 }
1966
1967 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1968 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1969 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1970 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1971 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1972 if (!x)
1973 return -ENOMEM;
1974 our_env[n_env++] = x;
1975 }
1976
1977 if (home) {
1978 x = strjoin("HOME=", home);
1979 if (!x)
1980 return -ENOMEM;
1981
1982 path_simplify(x + 5);
1983 our_env[n_env++] = x;
1984 }
1985
1986 if (username) {
1987 x = strjoin("LOGNAME=", username);
1988 if (!x)
1989 return -ENOMEM;
1990 our_env[n_env++] = x;
1991
1992 x = strjoin("USER=", username);
1993 if (!x)
1994 return -ENOMEM;
1995 our_env[n_env++] = x;
1996 }
1997
1998 if (shell) {
1999 x = strjoin("SHELL=", shell);
2000 if (!x)
2001 return -ENOMEM;
2002
2003 path_simplify(x + 6);
2004 our_env[n_env++] = x;
2005 }
2006
2007 if (!sd_id128_is_null(u->invocation_id)) {
2008 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
2009 return -ENOMEM;
2010
2011 our_env[n_env++] = x;
2012 }
2013
2014 if (exec_context_needs_term(c)) {
2015 _cleanup_free_ char *cmdline = NULL;
2016 const char *tty_path, *term = NULL;
2017
2018 tty_path = exec_context_tty_path(c);
2019
2020 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
2021 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
2022 * container manager passes to PID 1 ends up all the way in the console login shown. */
2023
2024 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
2025 term = getenv("TERM");
2026 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
2027 _cleanup_free_ char *key = NULL;
2028
2029 key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
2030 if (!key)
2031 return -ENOMEM;
2032
2033 r = proc_cmdline_get_key(key, 0, &cmdline);
2034 if (r < 0)
2035 log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
2036 else if (r > 0)
2037 term = cmdline;
2038 }
2039
2040 if (!term)
2041 term = default_term_for_tty(tty_path);
2042
2043 x = strjoin("TERM=", term);
2044 if (!x)
2045 return -ENOMEM;
2046 our_env[n_env++] = x;
2047 }
2048
2049 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
2050 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
2051 return -ENOMEM;
2052
2053 our_env[n_env++] = x;
2054 }
2055
2056 if (c->log_namespace) {
2057 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
2058 if (!x)
2059 return -ENOMEM;
2060
2061 our_env[n_env++] = x;
2062 }
2063
2064 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2065 _cleanup_free_ char *joined = NULL;
2066 const char *n;
2067
2068 if (!p->prefix[t])
2069 continue;
2070
2071 if (c->directories[t].n_items == 0)
2072 continue;
2073
2074 n = exec_directory_env_name_to_string(t);
2075 if (!n)
2076 continue;
2077
2078 for (size_t i = 0; i < c->directories[t].n_items; i++) {
2079 _cleanup_free_ char *prefixed = NULL;
2080
2081 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2082 if (!prefixed)
2083 return -ENOMEM;
2084
2085 if (!strextend_with_separator(&joined, ":", prefixed))
2086 return -ENOMEM;
2087 }
2088
2089 x = strjoin(n, "=", joined);
2090 if (!x)
2091 return -ENOMEM;
2092
2093 our_env[n_env++] = x;
2094 }
2095
2096 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
2097 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
2098 if (!x)
2099 return -ENOMEM;
2100
2101 our_env[n_env++] = x;
2102 }
2103
2104 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2105 return -ENOMEM;
2106
2107 our_env[n_env++] = x;
2108
2109 if (memory_pressure_path) {
2110 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2111 if (!x)
2112 return -ENOMEM;
2113
2114 our_env[n_env++] = x;
2115
2116 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2117 _cleanup_free_ char *b = NULL, *e = NULL;
2118
2119 if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2120 MEMORY_PRESSURE_DEFAULT_TYPE,
2121 cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2122 CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2123 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2124 return -ENOMEM;
2125
2126 if (base64mem(b, strlen(b) + 1, &e) < 0)
2127 return -ENOMEM;
2128
2129 x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2130 if (!x)
2131 return -ENOMEM;
2132
2133 our_env[n_env++] = x;
2134 }
2135 }
2136
2137 assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2138 #undef N_ENV_VARS
2139
2140 *ret = TAKE_PTR(our_env);
2141
2142 return 0;
2143 }
2144
2145 static int build_pass_environment(const ExecContext *c, char ***ret) {
2146 _cleanup_strv_free_ char **pass_env = NULL;
2147 size_t n_env = 0;
2148
2149 STRV_FOREACH(i, c->pass_environment) {
2150 _cleanup_free_ char *x = NULL;
2151 char *v;
2152
2153 v = getenv(*i);
2154 if (!v)
2155 continue;
2156 x = strjoin(*i, "=", v);
2157 if (!x)
2158 return -ENOMEM;
2159
2160 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2161 return -ENOMEM;
2162
2163 pass_env[n_env++] = TAKE_PTR(x);
2164 pass_env[n_env] = NULL;
2165 }
2166
2167 *ret = TAKE_PTR(pass_env);
2168
2169 return 0;
2170 }
2171
2172 bool exec_needs_network_namespace(const ExecContext *context) {
2173 assert(context);
2174
2175 return context->private_network || context->network_namespace_path;
2176 }
2177
2178 static bool exec_needs_ephemeral(const ExecContext *context) {
2179 return (context->root_image || context->root_directory) && context->root_ephemeral;
2180 }
2181
2182 static bool exec_needs_ipc_namespace(const ExecContext *context) {
2183 assert(context);
2184
2185 return context->private_ipc || context->ipc_namespace_path;
2186 }
2187
2188 bool exec_needs_mount_namespace(
2189 const ExecContext *context,
2190 const ExecParameters *params,
2191 const ExecRuntime *runtime) {
2192
2193 assert(context);
2194
2195 if (context->root_image)
2196 return true;
2197
2198 if (!strv_isempty(context->read_write_paths) ||
2199 !strv_isempty(context->read_only_paths) ||
2200 !strv_isempty(context->inaccessible_paths) ||
2201 !strv_isempty(context->exec_paths) ||
2202 !strv_isempty(context->no_exec_paths))
2203 return true;
2204
2205 if (context->n_bind_mounts > 0)
2206 return true;
2207
2208 if (context->n_temporary_filesystems > 0)
2209 return true;
2210
2211 if (context->n_mount_images > 0)
2212 return true;
2213
2214 if (context->n_extension_images > 0)
2215 return true;
2216
2217 if (!strv_isempty(context->extension_directories))
2218 return true;
2219
2220 if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
2221 return true;
2222
2223 if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
2224 return true;
2225
2226 if (context->private_devices ||
2227 context->private_mounts > 0 ||
2228 (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
2229 context->protect_system != PROTECT_SYSTEM_NO ||
2230 context->protect_home != PROTECT_HOME_NO ||
2231 context->protect_kernel_tunables ||
2232 context->protect_kernel_modules ||
2233 context->protect_kernel_logs ||
2234 context->protect_control_groups ||
2235 context->protect_proc != PROTECT_PROC_DEFAULT ||
2236 context->proc_subset != PROC_SUBSET_ALL ||
2237 exec_needs_ipc_namespace(context))
2238 return true;
2239
2240 if (context->root_directory) {
2241 if (exec_context_get_effective_mount_apivfs(context))
2242 return true;
2243
2244 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2245 if (params && !params->prefix[t])
2246 continue;
2247
2248 if (context->directories[t].n_items > 0)
2249 return true;
2250 }
2251 }
2252
2253 if (context->dynamic_user &&
2254 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2255 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2256 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2257 return true;
2258
2259 if (context->log_namespace)
2260 return true;
2261
2262 return false;
2263 }
2264
2265 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2266 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2267 _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
2268 _cleanup_close_ int unshare_ready_fd = -EBADF;
2269 _cleanup_(sigkill_waitp) pid_t pid = 0;
2270 uint64_t c = 1;
2271 ssize_t n;
2272 int r;
2273
2274 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2275 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2276 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2277 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2278 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2279 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2280 * continues execution normally.
2281 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2282 * does not need CAP_SETUID to write the single line mapping to itself. */
2283
2284 /* Can only set up multiple mappings with CAP_SETUID. */
2285 if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2286 r = asprintf(&uid_map,
2287 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2288 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2289 ouid, ouid, uid, uid);
2290 else
2291 r = asprintf(&uid_map,
2292 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2293 ouid, ouid);
2294
2295 if (r < 0)
2296 return -ENOMEM;
2297
2298 /* Can only set up multiple mappings with CAP_SETGID. */
2299 if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2300 r = asprintf(&gid_map,
2301 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2302 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2303 ogid, ogid, gid, gid);
2304 else
2305 r = asprintf(&gid_map,
2306 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2307 ogid, ogid);
2308
2309 if (r < 0)
2310 return -ENOMEM;
2311
2312 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2313 * namespace. */
2314 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2315 if (unshare_ready_fd < 0)
2316 return -errno;
2317
2318 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2319 * failed. */
2320 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2321 return -errno;
2322
2323 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2324 if (r < 0)
2325 return r;
2326 if (r == 0) {
2327 _cleanup_close_ int fd = -EBADF;
2328 const char *a;
2329 pid_t ppid;
2330
2331 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2332 * here, after the parent opened its own user namespace. */
2333
2334 ppid = getppid();
2335 errno_pipe[0] = safe_close(errno_pipe[0]);
2336
2337 /* Wait until the parent unshared the user namespace */
2338 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2339 r = -errno;
2340 goto child_fail;
2341 }
2342
2343 /* Disable the setgroups() system call in the child user namespace, for good. */
2344 a = procfs_file_alloca(ppid, "setgroups");
2345 fd = open(a, O_WRONLY|O_CLOEXEC);
2346 if (fd < 0) {
2347 if (errno != ENOENT) {
2348 r = -errno;
2349 goto child_fail;
2350 }
2351
2352 /* If the file is missing the kernel is too old, let's continue anyway. */
2353 } else {
2354 if (write(fd, "deny\n", 5) < 0) {
2355 r = -errno;
2356 goto child_fail;
2357 }
2358
2359 fd = safe_close(fd);
2360 }
2361
2362 /* First write the GID map */
2363 a = procfs_file_alloca(ppid, "gid_map");
2364 fd = open(a, O_WRONLY|O_CLOEXEC);
2365 if (fd < 0) {
2366 r = -errno;
2367 goto child_fail;
2368 }
2369 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2370 r = -errno;
2371 goto child_fail;
2372 }
2373 fd = safe_close(fd);
2374
2375 /* The write the UID map */
2376 a = procfs_file_alloca(ppid, "uid_map");
2377 fd = open(a, O_WRONLY|O_CLOEXEC);
2378 if (fd < 0) {
2379 r = -errno;
2380 goto child_fail;
2381 }
2382 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2383 r = -errno;
2384 goto child_fail;
2385 }
2386
2387 _exit(EXIT_SUCCESS);
2388
2389 child_fail:
2390 (void) write(errno_pipe[1], &r, sizeof(r));
2391 _exit(EXIT_FAILURE);
2392 }
2393
2394 errno_pipe[1] = safe_close(errno_pipe[1]);
2395
2396 if (unshare(CLONE_NEWUSER) < 0)
2397 return -errno;
2398
2399 /* Let the child know that the namespace is ready now */
2400 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2401 return -errno;
2402
2403 /* Try to read an error code from the child */
2404 n = read(errno_pipe[0], &r, sizeof(r));
2405 if (n < 0)
2406 return -errno;
2407 if (n == sizeof(r)) { /* an error code was sent to us */
2408 if (r < 0)
2409 return r;
2410 return -EIO;
2411 }
2412 if (n != 0) /* on success we should have read 0 bytes */
2413 return -EIO;
2414
2415 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2416 if (r < 0)
2417 return r;
2418 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2419 return -EIO;
2420
2421 return 0;
2422 }
2423
2424 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2425 assert(context);
2426
2427 if (!context->dynamic_user)
2428 return false;
2429
2430 if (type == EXEC_DIRECTORY_CONFIGURATION)
2431 return false;
2432
2433 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2434 return false;
2435
2436 return true;
2437 }
2438
2439 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2440 _cleanup_free_ char *src_abs = NULL;
2441 int r;
2442
2443 assert(source);
2444
2445 src_abs = path_join(root, source);
2446 if (!src_abs)
2447 return -ENOMEM;
2448
2449 STRV_FOREACH(dst, symlinks) {
2450 _cleanup_free_ char *dst_abs = NULL;
2451
2452 dst_abs = path_join(root, *dst);
2453 if (!dst_abs)
2454 return -ENOMEM;
2455
2456 r = mkdir_parents_label(dst_abs, 0755);
2457 if (r < 0)
2458 return r;
2459
2460 r = symlink_idempotent(src_abs, dst_abs, true);
2461 if (r < 0)
2462 return r;
2463 }
2464
2465 return 0;
2466 }
2467
/* Creates and prepares all directories configured in the execution context for the given directory type,
 * below params->prefix[type].
 *
 * For every configured item this creates the parent directories, handles the migration cases described in
 * the comments below (compatibility symlinks in user scope, moving directories into or out of the "private/"
 * hierarchy), creates the directory itself, applies the configured access mode and, in system scope,
 * recursively adjusts ownership to uid/gid. If no mount namespace is going to be used, the additional
 * symlinks configured for each item are created at the end.
 *
 * Returns 0 on success (also if no prefix is set for this type, in which case nothing is done). On failure
 * returns a negative errno-style error and stores the exit status matching the directory type in
 * *exit_status. */
static int setup_exec_directory(
                Unit *u,
                const ExecContext *context,
                const ExecParameters *params,
                uid_t uid,
                gid_t gid,
                ExecDirectoryType type,
                bool needs_mount_namespace,
                int *exit_status) {

        /* Exit status to report via *exit_status on failure, indexed by directory type */
        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
                [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
                [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
                [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
                [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
        };
        int r;

        assert(context);
        assert(params);
        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
        assert(exit_status);

        /* No prefix known for this directory type? Then there's nothing to set up. */
        if (!params->prefix[type])
                return 0;

        /* If we are asked to chown the directories, fall back to root for any invalid UID/GID */
        if (params->flags & EXEC_CHOWN_DIRECTORIES) {
                if (!uid_is_valid(uid))
                        uid = 0;
                if (!gid_is_valid(gid))
                        gid = 0;
        }

        for (size_t i = 0; i < context->directories[type].n_items; i++) {
                /* p is the public path of the directory, pp (if set) the path below "private/" */
                _cleanup_free_ char *p = NULL, *pp = NULL;

                p = path_join(params->prefix[type], context->directories[type].items[i].path);
                if (!p) {
                        r = -ENOMEM;
                        goto fail;
                }

                r = mkdir_parents_label(p, 0755);
                if (r < 0)
                        goto fail;

                if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {

                        /* If we are in user mode, and a configuration directory exists but a state directory
                         * doesn't exist, then we likely are upgrading from an older systemd version that
                         * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
                         * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
                         * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
                         * separated. If a service has both dirs configured but only the configuration dir
                         * exists and the state dir does not, we assume we are looking at an update
                         * situation. Hence, create a compatibility symlink, so that all expectations are
                         * met.
                         *
                         * (We also do something similar with the log directory, which still doesn't exist in
                         * the xdg basedir spec. We'll make it a subdir of the state dir.) */

                        /* this assumes the state dir is always created before the configuration dir */
                        assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
                        assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);

                        r = laccess(p, F_OK);
                        if (r == -ENOENT) {
                                _cleanup_free_ char *q = NULL;

                                /* OK, we know that the state dir does not exist. Let's see if the dir exists
                                 * under the configuration hierarchy. */

                                if (type == EXEC_DIRECTORY_STATE)
                                        q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
                                else if (type == EXEC_DIRECTORY_LOGS)
                                        q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
                                else
                                        assert_not_reached();
                                if (!q) {
                                        r = -ENOMEM;
                                        goto fail;
                                }

                                r = laccess(q, F_OK);
                                if (r >= 0) {
                                        /* It does exist! This hence looks like an update. Symlink the
                                         * configuration directory into the state directory. */

                                        r = symlink_idempotent(q, p, /* make_relative= */ true);
                                        if (r < 0)
                                                goto fail;

                                        log_unit_notice(u, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
                                        /* The symlink is all we set up for this item, skip the rest */
                                        continue;
                                } else if (r != -ENOENT)
                                        log_unit_warning_errno(u, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);

                        } else if (r < 0)
                                log_unit_warning_errno(u, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
                }

                if (exec_directory_is_private(context, type)) {
                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
                         * case we want to avoid leaving a directory around fully accessible that is owned by
                         * a dynamic user whose UID is later on reused. To lock this down we use the same
                         * trick used by container managers to prohibit host users to get access to files of
                         * the same UID in containers: we place everything inside a directory that has an
                         * access mode of 0700 and is owned root:root, so that it acts as security boundary
                         * for unprivileged host code. We then use fs namespacing to make this directory
                         * permeable for the service itself.
                         *
                         * Specifically: for a service which wants a special directory "foo/" we first create
                         * a directory "private/" with access mode 0700 owned by root:root. Then we place
                         * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
                         * "private/foo". This way, privileged host users can access "foo/" as usual, but
                         * unprivileged host users can't look into it. Inside of the namespace of the unit
                         * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
                         * "private/foo/" is mounted under the same name, thus disabling the access boundary
                         * for the service and making sure it only gets access to the dirs it needs but no
                         * others. Tricky? Yes, absolutely, but it works!
                         *
                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
                         * to be owned by the service itself.
                         *
                         * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
                         * for sharing files or sockets with other services. */

                        pp = path_join(params->prefix[type], "private");
                        if (!pp) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
                        r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
                        if (r < 0)
                                goto fail;

                        /* From here on pp refers to the directory of this item below the private root */
                        if (!path_extend(&pp, context->directories[type].items[i].path)) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* Create all directories between the configured directory and this private root, and mark them 0755 */
                        r = mkdir_parents_label(pp, 0755);
                        if (r < 0)
                                goto fail;

                        if (is_dir(p, false) > 0 &&
                            (laccess(pp, F_OK) == -ENOENT)) {

                                /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
                                 * it over. Most likely the service has been upgraded from one that didn't use
                                 * DynamicUser=1, to one that does. */

                                log_unit_info(u, "Found pre-existing public %s= directory %s, migrating to %s.\n"
                                              "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
                                              exec_directory_type_to_string(type), p, pp);

                                r = RET_NERRNO(rename(p, pp));
                                if (r < 0)
                                        goto fail;
                        } else {
                                /* Otherwise, create the actual directory for the service */

                                r = mkdir_label(pp, context->directories[type].mode);
                                if (r < 0 && r != -EEXIST)
                                        goto fail;
                        }

                        if (!context->directories[type].items[i].only_create) {
                                /* And link it up from the original place.
                                 * Notes
                                 * 1) If a mount namespace is going to be used, then this symlink remains on
                                 *    the host, and a new one for the child namespace will be created later.
                                 * 2) It is not necessary to create this symlink when one of its parent
                                 *    directories is specified and already created. E.g.
                                 *        StateDirectory=foo foo/bar
                                 *    In that case, the inode points to pp and p for "foo/bar" are the same:
                                 *        pp = "/var/lib/private/foo/bar"
                                 *        p = "/var/lib/foo/bar"
                                 *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
                                 *    we do not need to create the symlink, but we cannot create the symlink.
                                 *    See issue #24783. */
                                r = symlink_idempotent(pp, p, true);
                                if (r < 0)
                                        goto fail;
                        }

                } else {
                        _cleanup_free_ char *target = NULL;

                        if (type != EXEC_DIRECTORY_CONFIGURATION &&
                            readlink_and_make_absolute(p, &target) >= 0) {
                                _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;

                                /* This already exists and is a symlink? Interesting. Maybe it's one created
                                 * by DynamicUser=1 (see above)?
                                 *
                                 * We do this for all directory types except for ConfigurationDirectory=,
                                 * since they all support the private/ symlink logic at least in some
                                 * configurations, see above. */

                                r = chase(target, NULL, 0, &target_resolved, NULL);
                                if (r < 0)
                                        goto fail;

                                q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
                                if (!q) {
                                        r = -ENOMEM;
                                        goto fail;
                                }

                                /* /var/lib or friends may be symlinks. So, let's chase them also. */
                                r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
                                if (r < 0)
                                        goto fail;

                                if (path_equal(q_resolved, target_resolved)) {

                                        /* Hmm, apparently DynamicUser= was once turned on for this service,
                                         * but is no longer. Let's move the directory back up. */

                                        log_unit_info(u, "Found pre-existing private %s= directory %s, migrating to %s.\n"
                                                      "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
                                                      exec_directory_type_to_string(type), q, p);

                                        /* Remove the symlink first, then move the private directory into its place */
                                        r = RET_NERRNO(unlink(p));
                                        if (r < 0)
                                                goto fail;

                                        r = RET_NERRNO(rename(q, p));
                                        if (r < 0)
                                                goto fail;
                                }
                        }

                        r = mkdir_label(p, context->directories[type].mode);
                        if (r < 0) {
                                /* An already existing directory is fine, everything else is fatal */
                                if (r != -EEXIST)
                                        goto fail;

                                if (type == EXEC_DIRECTORY_CONFIGURATION) {
                                        struct stat st;

                                        /* Don't change the owner/access mode of the configuration directory,
                                         * as in the common case it is not written to by a service, and shall
                                         * not be writable. */

                                        r = RET_NERRNO(stat(p, &st));
                                        if (r < 0)
                                                goto fail;

                                        /* Still complain if the access mode doesn't match */
                                        if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
                                                log_unit_warning(u, "%s \'%s\' already exists but the mode is different. "
                                                                 "(File system: %o %sMode: %o)",
                                                                 exec_directory_type_to_string(type), context->directories[type].items[i].path,
                                                                 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);

                                        continue;
                                }
                        }
                }

                /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
                 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
                 * current UID/GID ownership.) */
                r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
                if (r < 0)
                        goto fail;

                /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
                 * available to user code anyway */
                if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
                        continue;

                /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
                 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
                 * assignments to exist. */
                r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
                if (r < 0)
                        goto fail;
        }

        /* If we are not going to run in a namespace, set up the symlinks - otherwise
         * they are set up later, to allow configuring empty var/run/etc. */
        if (!needs_mount_namespace)
                for (size_t i = 0; i < context->directories[type].n_items; i++) {
                        r = create_many_symlinks(params->prefix[type],
                                                 context->directories[type].items[i].path,
                                                 context->directories[type].items[i].symlinks);
                        if (r < 0)
                                goto fail;
                }

        return 0;

fail:
        /* Tell the caller which kind of directory we failed on */
        *exit_status = exit_status_table[type];
        return r;
}
2771
2772 static int write_credential(
2773 int dfd,
2774 const char *id,
2775 const void *data,
2776 size_t size,
2777 uid_t uid,
2778 bool ownership_ok) {
2779
2780 _cleanup_(unlink_and_freep) char *tmp = NULL;
2781 _cleanup_close_ int fd = -EBADF;
2782 int r;
2783
2784 r = tempfn_random_child("", "cred", &tmp);
2785 if (r < 0)
2786 return r;
2787
2788 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2789 if (fd < 0) {
2790 tmp = mfree(tmp);
2791 return -errno;
2792 }
2793
2794 r = loop_write(fd, data, size, /* do_poll = */ false);
2795 if (r < 0)
2796 return r;
2797
2798 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2799 return -errno;
2800
2801 if (uid_is_valid(uid) && uid != getuid()) {
2802 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2803 if (r < 0) {
2804 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2805 return r;
2806
2807 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2808 * to express: that the user gets read access and nothing
2809 * else. But if the backing fs can't support that (e.g. ramfs)
2810 * then we can use file ownership instead. But that's only safe if
2811 * we can then re-mount the whole thing read-only, so that the
2812 * user can no longer chmod() the file to gain write access. */
2813 return r;
2814
2815 if (fchown(fd, uid, GID_INVALID) < 0)
2816 return -errno;
2817 }
2818 }
2819
2820 if (renameat(dfd, tmp, dfd, id) < 0)
2821 return -errno;
2822
2823 tmp = mfree(tmp);
2824 return 0;
2825 }
2826
/* Selects which directories credential_search_path() puts into the search path it assembles. */
typedef enum CredentialSearchPath {
        CREDENTIAL_SEARCH_PATH_TRUSTED, /* received credentials directory + "credstore" directories */
        CREDENTIAL_SEARCH_PATH_ENCRYPTED, /* received encrypted credentials directory + "credstore.encrypted" directories */
        CREDENTIAL_SEARCH_PATH_ALL, /* both of the above, encrypted ones first */
        _CREDENTIAL_SEARCH_PATH_MAX, /* number of valid values, used for range checks */
        _CREDENTIAL_SEARCH_PATH_INVALID = -EINVAL,
} CredentialSearchPath;
2834
2835 static char **credential_search_path(const ExecParameters *params, CredentialSearchPath path) {
2836
2837 _cleanup_strv_free_ char **l = NULL;
2838
2839 assert(params);
2840 assert(path >= 0 && path < _CREDENTIAL_SEARCH_PATH_MAX);
2841
2842 /* Assemble a search path to find credentials in. For non-encrypted credentials, We'll look in
2843 * /etc/credstore/ (and similar directories in /usr/lib/ + /run/). If we're looking for encrypted
2844 * credentials, we'll look in /etc/credstore.encrypted/ (and similar dirs). */
2845
2846 if (IN_SET(path, CREDENTIAL_SEARCH_PATH_ENCRYPTED, CREDENTIAL_SEARCH_PATH_ALL)) {
2847 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2848 return NULL;
2849
2850 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2851 return NULL;
2852 }
2853
2854 if (IN_SET(path, CREDENTIAL_SEARCH_PATH_TRUSTED, CREDENTIAL_SEARCH_PATH_ALL)) {
2855 if (params->received_credentials_directory)
2856 if (strv_extend(&l, params->received_credentials_directory) < 0)
2857 return NULL;
2858
2859 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2860 return NULL;
2861 }
2862
2863 if (DEBUG_LOGGING) {
2864 _cleanup_free_ char *t = strv_join(l, ":");
2865
2866 log_debug("Credential search path is: %s", strempty(t));
2867 }
2868
2869 return TAKE_PTR(l);
2870 }
2871
2872 static int maybe_decrypt_and_write_credential(
2873 int dir_fd,
2874 const char *id,
2875 bool encrypted,
2876 uid_t uid,
2877 bool ownership_ok,
2878 const char *data,
2879 size_t size,
2880 uint64_t *left) {
2881
2882 _cleanup_free_ void *plaintext = NULL;
2883 size_t add;
2884 int r;
2885
2886 if (encrypted) {
2887 size_t plaintext_size = 0;
2888
2889 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size,
2890 &plaintext, &plaintext_size);
2891 if (r < 0)
2892 return r;
2893
2894 data = plaintext;
2895 size = plaintext_size;
2896 }
2897
2898 add = strlen(id) + size;
2899 if (add > *left)
2900 return -E2BIG;
2901
2902 r = write_credential(dir_fd, id, data, size, uid, ownership_ok);
2903 if (r < 0)
2904 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
2905
2906 *left -= add;
2907 return 0;
2908 }
2909
2910 static int load_credential_glob(
2911 const char *path,
2912 bool encrypted,
2913 char **search_path,
2914 ReadFullFileFlags flags,
2915 int write_dfd,
2916 uid_t uid,
2917 bool ownership_ok,
2918 uint64_t *left) {
2919
2920 int r;
2921
2922 STRV_FOREACH(d, search_path) {
2923 _cleanup_globfree_ glob_t pglob = {};
2924 _cleanup_free_ char *j = NULL;
2925
2926 j = path_join(*d, path);
2927 if (!j)
2928 return -ENOMEM;
2929
2930 r = safe_glob(j, 0, &pglob);
2931 if (r == -ENOENT)
2932 continue;
2933 if (r < 0)
2934 return r;
2935
2936 for (size_t n = 0; n < pglob.gl_pathc; n++) {
2937 _cleanup_free_ char *fn = NULL;
2938 _cleanup_(erase_and_freep) char *data = NULL;
2939 size_t size;
2940
2941 /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2942 r = read_full_file_full(
2943 AT_FDCWD,
2944 pglob.gl_pathv[n],
2945 UINT64_MAX,
2946 encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
2947 flags,
2948 NULL,
2949 &data, &size);
2950 if (r < 0)
2951 return log_debug_errno(r, "Failed to read credential '%s': %m",
2952 pglob.gl_pathv[n]);
2953
2954 r = path_extract_filename(pglob.gl_pathv[n], &fn);
2955 if (r < 0)
2956 return log_debug_errno(r, "Failed to extract filename from '%s': %m",
2957 pglob.gl_pathv[n]);
2958
2959 r = maybe_decrypt_and_write_credential(
2960 write_dfd,
2961 fn,
2962 encrypted,
2963 uid,
2964 ownership_ok,
2965 data, size,
2966 left);
2967 if (r == -EEXIST)
2968 continue;
2969 if (r < 0)
2970 return r;
2971 }
2972 }
2973
2974 return 0;
2975 }
2976
2977 static int load_credential(
2978 const ExecContext *context,
2979 const ExecParameters *params,
2980 const char *id,
2981 const char *path,
2982 bool encrypted,
2983 const char *unit,
2984 int read_dfd,
2985 int write_dfd,
2986 uid_t uid,
2987 bool ownership_ok,
2988 uint64_t *left) {
2989
2990 ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2991 _cleanup_strv_free_ char **search_path = NULL;
2992 _cleanup_(erase_and_freep) char *data = NULL;
2993 _cleanup_free_ char *bindname = NULL;
2994 const char *source = NULL;
2995 bool missing_ok = true;
2996 size_t size, maxsz;
2997 int r;
2998
2999 assert(context);
3000 assert(params);
3001 assert(id);
3002 assert(path);
3003 assert(unit);
3004 assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
3005 assert(write_dfd >= 0);
3006 assert(left);
3007
3008 if (read_dfd >= 0) {
3009 /* If a directory fd is specified, then read the file directly from that dir. In this case we
3010 * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
3011 * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
3012 * open it. */
3013
3014 if (!filename_is_valid(path)) /* safety check */
3015 return -EINVAL;
3016
3017 missing_ok = true;
3018 source = path;
3019
3020 } else if (path_is_absolute(path)) {
3021 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
3022 * sockets */
3023
3024 if (!path_is_valid(path)) /* safety check */
3025 return -EINVAL;
3026
3027 flags |= READ_FULL_FILE_CONNECT_SOCKET;
3028
3029 /* Pass some minimal info about the unit and the credential name we are looking to acquire
3030 * via the source socket address in case we read off an AF_UNIX socket. */
3031 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
3032 return -ENOMEM;
3033
3034 missing_ok = false;
3035 source = path;
3036
3037 } else if (credential_name_valid(path)) {
3038 /* If this is a relative path, take it as credential name relative to the credentials
3039 * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
3040 * are operating on a credential store, i.e. this is guaranteed to be regular files. */
3041
3042 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ALL);
3043 if (!search_path)
3044 return -ENOMEM;
3045
3046 missing_ok = true;
3047 } else
3048 source = NULL;
3049
3050 if (encrypted)
3051 flags |= READ_FULL_FILE_UNBASE64;
3052
3053 maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
3054
3055 if (search_path) {
3056 STRV_FOREACH(d, search_path) {
3057 _cleanup_free_ char *j = NULL;
3058
3059 j = path_join(*d, path);
3060 if (!j)
3061 return -ENOMEM;
3062
3063 r = read_full_file_full(
3064 AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
3065 UINT64_MAX,
3066 maxsz,
3067 flags,
3068 NULL,
3069 &data, &size);
3070 if (r != -ENOENT)
3071 break;
3072 }
3073 } else if (source)
3074 r = read_full_file_full(
3075 read_dfd, source,
3076 UINT64_MAX,
3077 maxsz,
3078 flags,
3079 bindname,
3080 &data, &size);
3081 else
3082 r = -ENOENT;
3083
3084 if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
3085 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
3086 * will get clear errors if we don't pass such a missing credential on as they
3087 * themselves will get ENOENT when trying to read them, which should not be much
3088 * worse than when we handle the error here and make it fatal.
3089 *
3090 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
3091 * we are fine, too. */
3092 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
3093 return 0;
3094 }
3095 if (r < 0)
3096 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
3097
3098 return maybe_decrypt_and_write_credential(write_dfd, id, encrypted, uid, ownership_ok, data, size, left);
3099 }
3100
/* Arguments handed to load_cred_recurse_dir_cb() as userdata when a credential directory is loaded
 * recursively. All of them are passed on to load_credential() for each entry found. */
struct load_cred_args {
        const ExecContext *context;
        const ExecParameters *params;
        bool encrypted; /* whether the credentials read from the directory are encrypted */
        const char *unit;
        int dfd; /* directory fd the credentials are written to */
        uid_t uid;
        bool ownership_ok;
        uint64_t *left; /* remaining size budget, shared with the caller */
};
3111
3112 static int load_cred_recurse_dir_cb(
3113 RecurseDirEvent event,
3114 const char *path,
3115 int dir_fd,
3116 int inode_fd,
3117 const struct dirent *de,
3118 const struct statx *sx,
3119 void *userdata) {
3120
3121 struct load_cred_args *args = ASSERT_PTR(userdata);
3122 _cleanup_free_ char *sub_id = NULL;
3123 int r;
3124
3125 if (event != RECURSE_DIR_ENTRY)
3126 return RECURSE_DIR_CONTINUE;
3127
3128 if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
3129 return RECURSE_DIR_CONTINUE;
3130
3131 sub_id = strreplace(path, "/", "_");
3132 if (!sub_id)
3133 return -ENOMEM;
3134
3135 if (!credential_name_valid(sub_id))
3136 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
3137
3138 if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
3139 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
3140 return RECURSE_DIR_CONTINUE;
3141 }
3142 if (errno != ENOENT)
3143 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
3144
3145 r = load_credential(
3146 args->context,
3147 args->params,
3148 sub_id,
3149 de->d_name,
3150 args->encrypted,
3151 args->unit,
3152 dir_fd,
3153 args->dfd,
3154 args->uid,
3155 args->ownership_ok,
3156 args->left);
3157 if (r < 0)
3158 return r;
3159
3160 return RECURSE_DIR_CONTINUE;
3161 }
3162
3163 static int acquire_credentials(
3164 const ExecContext *context,
3165 const ExecParameters *params,
3166 const char *unit,
3167 const char *p,
3168 uid_t uid,
3169 bool ownership_ok) {
3170
3171 uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
3172 _cleanup_close_ int dfd = -EBADF;
3173 const char *ic;
3174 ExecLoadCredential *lc;
3175 ExecSetCredential *sc;
3176 int r;
3177
3178 assert(context);
3179 assert(p);
3180
3181 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
3182 if (dfd < 0)
3183 return -errno;
3184
3185 r = fd_acl_make_writable(dfd); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */
3186 if (r < 0)
3187 return r;
3188
3189 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
3190 HASHMAP_FOREACH(lc, context->load_credentials) {
3191 _cleanup_close_ int sub_fd = -EBADF;
3192
3193 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
3194 * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
3195 * a regular file. Finally, if it's a relative path we will use it as a credential name to
3196 * propagate a credential passed to us from further up. */
3197
3198 if (path_is_absolute(lc->path)) {
3199 sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
3200 if (sub_fd < 0 && !IN_SET(errno,
3201 ENOTDIR, /* Not a directory */
3202 ENOENT)) /* Doesn't exist? */
3203 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
3204 }
3205
3206 if (sub_fd < 0)
3207 /* Regular file (incl. a credential passed in from higher up) */
3208 r = load_credential(
3209 context,
3210 params,
3211 lc->id,
3212 lc->path,
3213 lc->encrypted,
3214 unit,
3215 AT_FDCWD,
3216 dfd,
3217 uid,
3218 ownership_ok,
3219 &left);
3220 else
3221 /* Directory */
3222 r = recurse_dir(
3223 sub_fd,
3224 /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
3225 /* statx_mask= */ 0,
3226 /* n_depth_max= */ UINT_MAX,
3227 RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
3228 load_cred_recurse_dir_cb,
3229 &(struct load_cred_args) {
3230 .context = context,
3231 .params = params,
3232 .encrypted = lc->encrypted,
3233 .unit = unit,
3234 .dfd = dfd,
3235 .uid = uid,
3236 .ownership_ok = ownership_ok,
3237 .left = &left,
3238 });
3239 if (r < 0)
3240 return r;
3241 }
3242
3243 /* Next, look for system credentials and credentials in the credentials store. Note that these do not
3244 * override any credentials found earlier. */
3245 SET_FOREACH(ic, context->import_credentials) {
3246 _cleanup_free_ char **search_path = NULL;
3247
3248 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_TRUSTED);
3249 if (!search_path)
3250 return -ENOMEM;
3251
3252 r = load_credential_glob(
3253 ic,
3254 /* encrypted = */ false,
3255 search_path,
3256 READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER,
3257 dfd,
3258 uid,
3259 ownership_ok,
3260 &left);
3261 if (r < 0)
3262 return r;
3263
3264 search_path = strv_free(search_path);
3265 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ENCRYPTED);
3266 if (!search_path)
3267 return -ENOMEM;
3268
3269 r = load_credential_glob(
3270 ic,
3271 /* encrypted = */ true,
3272 search_path,
3273 READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER|READ_FULL_FILE_UNBASE64,
3274 dfd,
3275 uid,
3276 ownership_ok,
3277 &left);
3278 if (r < 0)
3279 return r;
3280 }
3281
3282 /* Finally, we add in literally specified credentials. If the credentials already exist, we'll not
3283 * add them, so that they can act as a "default" if the same credential is specified multiple times. */
3284 HASHMAP_FOREACH(sc, context->set_credentials) {
3285 _cleanup_(erase_and_freep) void *plaintext = NULL;
3286 const char *data;
3287 size_t size, add;
3288
3289 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
3290 * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
3291 * slow and involved, hence it's nice to be able to skip that if the credential already
3292 * exists anyway. */
3293 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
3294 continue;
3295 if (errno != ENOENT)
3296 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
3297
3298 if (sc->encrypted) {
3299 r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
3300 if (r < 0)
3301 return r;
3302
3303 data = plaintext;
3304 } else {
3305 data = sc->data;
3306 size = sc->size;
3307 }
3308
3309 add = strlen(sc->id) + size;
3310 if (add > left)
3311 return -E2BIG;
3312
3313 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
3314 if (r < 0)
3315 return r;
3316
3317 left -= add;
3318 }
3319
3320 r = fd_acl_make_read_only(dfd); /* Now take away the "w" bit */
3321 if (r < 0)
3322 return r;
3323
3324 /* After we created all keys with the right perms, also make sure the credential store as a whole is
3325 * accessible */
3326
3327 if (uid_is_valid(uid) && uid != getuid()) {
3328 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
3329 if (r < 0) {
3330 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3331 return r;
3332
3333 if (!ownership_ok)
3334 return r;
3335
3336 if (fchown(dfd, uid, GID_INVALID) < 0)
3337 return -errno;
3338 }
3339 }
3340
3341 return 0;
3342 }
3343
3344 static int setup_credentials_internal(
3345 const ExecContext *context,
3346 const ExecParameters *params,
3347 const char *unit,
3348 const char *final, /* This is where the credential store shall eventually end up at */
3349 const char *workspace, /* This is where we can prepare it before moving it to the final place */
3350 bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
3351 bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
3352 uid_t uid) {
3353
3354 int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
3355 * if we mounted something; false if we definitely can't mount anything */
3356 bool final_mounted;
3357 const char *where;
3358
3359 assert(context);
3360 assert(final);
3361 assert(workspace);
3362
3363 if (reuse_workspace) {
3364 r = path_is_mount_point(workspace, NULL, 0);
3365 if (r < 0)
3366 return r;
3367 if (r > 0)
3368 workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3369 else
3370 workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
3371 } else
3372 workspace_mounted = -1; /* ditto */
3373
3374 r = path_is_mount_point(final, NULL, 0);
3375 if (r < 0)
3376 return r;
3377 if (r > 0) {
3378 /* If the final place already has something mounted, we use that. If the workspace also has
3379 * something mounted we assume it's actually the same mount (but with MS_RDONLY
3380 * different). */
3381 final_mounted = true;
3382
3383 if (workspace_mounted < 0) {
3384 /* If the final place is mounted, but the workspace isn't, then let's bind mount
3385 * the final version to the workspace, and make it writable, so that we can make
3386 * changes */
3387
3388 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3389 if (r < 0)
3390 return r;
3391
3392 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
3393 if (r < 0)
3394 return r;
3395
3396 workspace_mounted = true;
3397 }
3398 } else
3399 final_mounted = false;
3400
3401 if (workspace_mounted < 0) {
3402 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3403
3404 r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
3405 if (r < 0) {
3406 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
3407 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3408 if (r < 0) {
3409 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3410 return r;
3411
3412 if (must_mount) /* If we it's not OK to use the plain directory
3413 * fallback, propagate all errors too */
3414 return r;
3415
3416 /* If we lack privileges to bind mount stuff, then let's gracefully
3417 * proceed for compat with container envs, and just use the final dir
3418 * as is. */
3419
3420 workspace_mounted = false;
3421 } else {
3422 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3423 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
3424 if (r < 0)
3425 return r;
3426
3427 workspace_mounted = true;
3428 }
3429 } else
3430 workspace_mounted = true;
3431 }
3432
3433 assert(!must_mount || workspace_mounted > 0);
3434 where = workspace_mounted ? workspace : final;
3435
3436 (void) label_fix_full(AT_FDCWD, where, final, 0);
3437
3438 r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
3439 if (r < 0)
3440 return r;
3441
3442 if (workspace_mounted) {
3443 bool install;
3444
3445 /* Determine if we should actually install the prepared mount in the final location by bind
3446 * mounting it there. We do so only if the mount is not established there already, and if the
3447 * mount is actually non-empty (i.e. carries at least one credential). Not that in the best
3448 * case we are doing all this in a mount namespace, thus no one else will see that we
3449 * allocated a file system we are getting rid of again here. */
3450 if (final_mounted)
3451 install = false; /* already installed */
3452 else {
3453 r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false);
3454 if (r < 0)
3455 return r;
3456
3457 install = r == 0; /* install only if non-empty */
3458 }
3459
3460 if (install) {
3461 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3462 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
3463 if (r < 0)
3464 return r;
3465
3466 /* And mount it to the final place, read-only */
3467 r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3468 } else
3469 /* Otherwise get rid of it */
3470 r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3471 if (r < 0)
3472 return r;
3473 } else {
3474 _cleanup_free_ char *parent = NULL;
3475
3476 /* If we do not have our own mount put used the plain directory fallback, then we need to
3477 * open access to the top-level credential directory and the per-service directory now */
3478
3479 r = path_extract_directory(final, &parent);
3480 if (r < 0)
3481 return r;
3482 if (chmod(parent, 0755) < 0)
3483 return -errno;
3484 }
3485
3486 return 0;
3487 }
3488
3489 static int setup_credentials(
3490 const ExecContext *context,
3491 const ExecParameters *params,
3492 const char *unit,
3493 uid_t uid) {
3494
3495 _cleanup_free_ char *p = NULL, *q = NULL;
3496 int r;
3497
3498 assert(context);
3499 assert(params);
3500
3501 if (!exec_context_has_credentials(context))
3502 return 0;
3503
3504 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3505 return -EINVAL;
3506
3507 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3508 * and the subdir we mount over with a read-only file system readable by the service's user */
3509 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3510 if (!q)
3511 return -ENOMEM;
3512
3513 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3514 if (r < 0 && r != -EEXIST)
3515 return r;
3516
3517 p = path_join(q, unit);
3518 if (!p)
3519 return -ENOMEM;
3520
3521 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3522 if (r < 0 && r != -EEXIST)
3523 return r;
3524
3525 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3526 if (r < 0) {
3527 _cleanup_free_ char *t = NULL, *u = NULL;
3528
3529 /* If this is not a privilege or support issue then propagate the error */
3530 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3531 return r;
3532
3533 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3534 * it into place, so that users can't access half-initialized credential stores. */
3535 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3536 if (!t)
3537 return -ENOMEM;
3538
3539 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3540 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3541 * after it is fully set up */
3542 u = path_join(t, unit);
3543 if (!u)
3544 return -ENOMEM;
3545
3546 FOREACH_STRING(i, t, u) {
3547 r = mkdir_label(i, 0700);
3548 if (r < 0 && r != -EEXIST)
3549 return r;
3550 }
3551
3552 r = setup_credentials_internal(
3553 context,
3554 params,
3555 unit,
3556 p, /* final mount point */
3557 u, /* temporary workspace to overmount */
3558 true, /* reuse the workspace if it is already a mount */
3559 false, /* it's OK to fall back to a plain directory if we can't mount anything */
3560 uid);
3561
3562 (void) rmdir(u); /* remove the workspace again if we can. */
3563
3564 if (r < 0)
3565 return r;
3566
3567 } else if (r == 0) {
3568
3569 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3570 * we can use the same directory for all cases, after turning off propagation. Question
3571 * though is: where do we turn off propagation exactly, and where do we place the workspace
3572 * directory? We need some place that is guaranteed to be a mount point in the host, and
3573 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3574 * since we ultimately want to move the resulting file system there, i.e. we need propagation
3575 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3576 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3577 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3578 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3579 * propagation on the former, and then overmount the latter.
3580 *
3581 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3582 * for this purpose, but there are few other candidates that work equally well for us, and
3583 * given that the we do this in a privately namespaced short-lived single-threaded process
3584 * that no one else sees this should be OK to do. */
3585
3586 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3587 if (r < 0)
3588 goto child_fail;
3589
3590 r = setup_credentials_internal(
3591 context,
3592 params,
3593 unit,
3594 p, /* final mount point */
3595 "/dev/shm", /* temporary workspace to overmount */
3596 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3597 true, /* insist that something is mounted, do not allow fallback to plain directory */
3598 uid);
3599 if (r < 0)
3600 goto child_fail;
3601
3602 _exit(EXIT_SUCCESS);
3603
3604 child_fail:
3605 _exit(EXIT_FAILURE);
3606 }
3607
3608 /* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
3609 * try to remove it. This matters in particular if we created the dir as mount point but then didn't
3610 * actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCESS being
3611 * seen by users when trying access this inode. */
3612 (void) rmdir(p);
3613 return 0;
3614 }
3615
#if ENABLE_SMACK
/* Applies a SMACK process label via mac_smack_apply_pid(): the label configured in the context if there is
 * one; otherwise, if the manager has a default label, the label read from the executable's SMACK_ATTR_EXEC
 * attribute, falling back to that manager default. Returns 0 on success (or if nothing is configured),
 * negative on failure. */
static int setup_smack(
                const Manager *manager,
                const ExecContext *context,
                int executable_fd) {

        _cleanup_free_ char *exec_label = NULL;
        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

        if (!manager->default_smack_process_label)
                return 0;

        /* A missing attribute on the executable is fine, we then use the manager's default label */
        r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
        if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                return r;

        r = mac_smack_apply_pid(0, exec_label ?: manager->default_smack_process_label);
        if (r < 0)
                return r;

        return 0;
}
#endif
3645
3646 static int compile_bind_mounts(
3647 const ExecContext *context,
3648 const ExecParameters *params,
3649 BindMount **ret_bind_mounts,
3650 size_t *ret_n_bind_mounts,
3651 char ***ret_empty_directories) {
3652
3653 _cleanup_strv_free_ char **empty_directories = NULL;
3654 BindMount *bind_mounts = NULL;
3655 size_t n, h = 0;
3656 int r;
3657
3658 assert(context);
3659 assert(params);
3660 assert(ret_bind_mounts);
3661 assert(ret_n_bind_mounts);
3662 assert(ret_empty_directories);
3663
3664 CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
3665
3666 n = context->n_bind_mounts;
3667 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3668 if (!params->prefix[t])
3669 continue;
3670
3671 for (size_t i = 0; i < context->directories[t].n_items; i++)
3672 n += !context->directories[t].items[i].only_create;
3673 }
3674
3675 if (n <= 0) {
3676 *ret_bind_mounts = NULL;
3677 *ret_n_bind_mounts = 0;
3678 *ret_empty_directories = NULL;
3679 return 0;
3680 }
3681
3682 bind_mounts = new(BindMount, n);
3683 if (!bind_mounts)
3684 return -ENOMEM;
3685
3686 for (size_t i = 0; i < context->n_bind_mounts; i++) {
3687 BindMount *item = context->bind_mounts + i;
3688 _cleanup_free_ char *s = NULL, *d = NULL;
3689
3690 s = strdup(item->source);
3691 if (!s)
3692 return -ENOMEM;
3693
3694 d = strdup(item->destination);
3695 if (!d)
3696 return -ENOMEM;
3697
3698 bind_mounts[h++] = (BindMount) {
3699 .source = TAKE_PTR(s),
3700 .destination = TAKE_PTR(d),
3701 .read_only = item->read_only,
3702 .recursive = item->recursive,
3703 .ignore_enoent = item->ignore_enoent,
3704 };
3705 }
3706
3707 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3708 if (!params->prefix[t])
3709 continue;
3710
3711 if (context->directories[t].n_items == 0)
3712 continue;
3713
3714 if (exec_directory_is_private(context, t) &&
3715 !exec_context_with_rootfs(context)) {
3716 char *private_root;
3717
3718 /* So this is for a dynamic user, and we need to make sure the process can access its own
3719 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3720 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3721
3722 private_root = path_join(params->prefix[t], "private");
3723 if (!private_root)
3724 return -ENOMEM;
3725
3726 r = strv_consume(&empty_directories, private_root);
3727 if (r < 0)
3728 return r;
3729 }
3730
3731 for (size_t i = 0; i < context->directories[t].n_items; i++) {
3732 _cleanup_free_ char *s = NULL, *d = NULL;
3733
3734 /* When one of the parent directories is in the list, we cannot create the symlink
3735 * for the child directory. See also the comments in setup_exec_directory(). */
3736 if (context->directories[t].items[i].only_create)
3737 continue;
3738
3739 if (exec_directory_is_private(context, t))
3740 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
3741 else
3742 s = path_join(params->prefix[t], context->directories[t].items[i].path);
3743 if (!s)
3744 return -ENOMEM;
3745
3746 if (exec_directory_is_private(context, t) &&
3747 exec_context_with_rootfs(context))
3748 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3749 * directory is not created on the root directory. So, let's bind-mount the directory
3750 * on the 'non-private' place. */
3751 d = path_join(params->prefix[t], context->directories[t].items[i].path);
3752 else
3753 d = strdup(s);
3754 if (!d)
3755 return -ENOMEM;
3756
3757 bind_mounts[h++] = (BindMount) {
3758 .source = TAKE_PTR(s),
3759 .destination = TAKE_PTR(d),
3760 .read_only = false,
3761 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3762 .recursive = true,
3763 .ignore_enoent = false,
3764 };
3765 }
3766 }
3767
3768 assert(h == n);
3769
3770 *ret_bind_mounts = TAKE_PTR(bind_mounts);
3771 *ret_n_bind_mounts = n;
3772 *ret_empty_directories = TAKE_PTR(empty_directories);
3773
3774 return (int) n;
3775 }
3776
3777 /* ret_symlinks will contain a list of pairs src:dest that describes
3778 * the symlinks to create later on. For example, the symlinks needed
3779 * to safely give private directories to DynamicUser=1 users. */
3780 static int compile_symlinks(
3781 const ExecContext *context,
3782 const ExecParameters *params,
3783 char ***ret_symlinks) {
3784
3785 _cleanup_strv_free_ char **symlinks = NULL;
3786 int r;
3787
3788 assert(context);
3789 assert(params);
3790 assert(ret_symlinks);
3791
3792 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3793 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3794 _cleanup_free_ char *private_path = NULL, *path = NULL;
3795
3796 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3797 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3798
3799 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3800 dst_abs = path_join(params->prefix[dt], *symlink);
3801 if (!src_abs || !dst_abs)
3802 return -ENOMEM;
3803
3804 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3805 if (r < 0)
3806 return r;
3807 }
3808
3809 if (!exec_directory_is_private(context, dt) ||
3810 exec_context_with_rootfs(context) ||
3811 context->directories[dt].items[i].only_create)
3812 continue;
3813
3814 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
3815 if (!private_path)
3816 return -ENOMEM;
3817
3818 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3819 if (!path)
3820 return -ENOMEM;
3821
3822 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3823 if (r < 0)
3824 return r;
3825 }
3826 }
3827
3828 *ret_symlinks = TAKE_PTR(symlinks);
3829
3830 return 0;
3831 }
3832
3833 static bool insist_on_sandboxing(
3834 const ExecContext *context,
3835 const char *root_dir,
3836 const char *root_image,
3837 const BindMount *bind_mounts,
3838 size_t n_bind_mounts) {
3839
3840 assert(context);
3841 assert(n_bind_mounts == 0 || bind_mounts);
3842
3843 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3844 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3845 * rearrange stuff in a way we cannot ignore gracefully. */
3846
3847 if (context->n_temporary_filesystems > 0)
3848 return true;
3849
3850 if (root_dir || root_image)
3851 return true;
3852
3853 if (context->n_mount_images > 0)
3854 return true;
3855
3856 if (context->dynamic_user)
3857 return true;
3858
3859 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3860 return true;
3861
3862 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3863 * essential. */
3864 for (size_t i = 0; i < n_bind_mounts; i++)
3865 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3866 return true;
3867
3868 if (context->log_namespace)
3869 return true;
3870
3871 return false;
3872 }
3873
3874 static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
3875 _cleanup_close_ int fd = -EBADF;
3876 int r;
3877
3878 if (!runtime || !runtime->ephemeral_copy)
3879 return 0;
3880
3881 r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
3882 if (r < 0)
3883 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
3884
3885 CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
3886
3887 fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3888 if (fd >= 0)
3889 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3890 return 0;
3891
3892 if (fd != -EAGAIN)
3893 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3894
3895 log_debug("Making ephemeral snapshot of %s to %s",
3896 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3897
3898 if (context->root_image)
3899 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
3900 COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
3901 else
3902 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
3903 AT_FDCWD, runtime->ephemeral_copy,
3904 BTRFS_SNAPSHOT_FALLBACK_COPY |
3905 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3906 BTRFS_SNAPSHOT_RECURSIVE |
3907 BTRFS_SNAPSHOT_LOCK_BSD);
3908 if (fd < 0)
3909 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
3910 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3911
3912 if (context->root_image) {
3913 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3914 * which tends to not perform well in combination with lots of random writes.
3915 *
3916 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3917 * copy, but we at least want to make the intention clear.
3918 */
3919 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
3920 if (r < 0)
3921 log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
3922 }
3923
3924 r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
3925 if (r < 0)
3926 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
3927
3928 return 1;
3929 }
3930
3931 static int verity_settings_prepare(
3932 VeritySettings *verity,
3933 const char *root_image,
3934 const void *root_hash,
3935 size_t root_hash_size,
3936 const char *root_hash_path,
3937 const void *root_hash_sig,
3938 size_t root_hash_sig_size,
3939 const char *root_hash_sig_path,
3940 const char *verity_data_path) {
3941
3942 int r;
3943
3944 assert(verity);
3945
3946 if (root_hash) {
3947 void *d;
3948
3949 d = memdup(root_hash, root_hash_size);
3950 if (!d)
3951 return -ENOMEM;
3952
3953 free_and_replace(verity->root_hash, d);
3954 verity->root_hash_size = root_hash_size;
3955 verity->designator = PARTITION_ROOT;
3956 }
3957
3958 if (root_hash_sig) {
3959 void *d;
3960
3961 d = memdup(root_hash_sig, root_hash_sig_size);
3962 if (!d)
3963 return -ENOMEM;
3964
3965 free_and_replace(verity->root_hash_sig, d);
3966 verity->root_hash_sig_size = root_hash_sig_size;
3967 verity->designator = PARTITION_ROOT;
3968 }
3969
3970 if (verity_data_path) {
3971 r = free_and_strdup(&verity->data_path, verity_data_path);
3972 if (r < 0)
3973 return r;
3974 }
3975
3976 r = verity_settings_load(
3977 verity,
3978 root_image,
3979 root_hash_path,
3980 root_hash_sig_path);
3981 if (r < 0)
3982 return log_debug_errno(r, "Failed to load root hash: %m");
3983
3984 return 0;
3985 }
3986
3987 static int apply_mount_namespace(
3988 const Unit *u,
3989 ExecCommandFlags command_flags,
3990 const ExecContext *context,
3991 const ExecParameters *params,
3992 ExecRuntime *runtime,
3993 const char *memory_pressure_path,
3994 char **error_path) {
3995
3996 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3997 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3998 **read_write_paths_cleanup = NULL;
3999 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
4000 *extension_dir = NULL;
4001 const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
4002 char **read_write_paths;
4003 NamespaceInfo ns_info;
4004 bool needs_sandboxing;
4005 BindMount *bind_mounts = NULL;
4006 size_t n_bind_mounts = 0;
4007 int r;
4008
4009 assert(context);
4010
4011 CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
4012
4013 if (params->flags & EXEC_APPLY_CHROOT) {
4014 r = setup_ephemeral(context, runtime);
4015 if (r < 0)
4016 return r;
4017
4018 if (context->root_image)
4019 root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
4020 else
4021 root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
4022 }
4023
4024 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
4025 if (r < 0)
4026 return r;
4027
4028 /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
4029 r = compile_symlinks(context, params, &symlinks);
4030 if (r < 0)
4031 return r;
4032
4033 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
4034 * service will need to write to it in order to start the notifications. */
4035 if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
4036 read_write_paths_cleanup = strv_copy(context->read_write_paths);
4037 if (!read_write_paths_cleanup)
4038 return -ENOMEM;
4039
4040 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
4041 if (r < 0)
4042 return r;
4043
4044 read_write_paths = read_write_paths_cleanup;
4045 } else
4046 read_write_paths = context->read_write_paths;
4047
4048 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4049 if (needs_sandboxing) {
4050 /* The runtime struct only contains the parent of the private /tmp,
4051 * which is non-accessible to world users. Inside of it there's a /tmp
4052 * that is sticky, and that's the one we want to use here.
4053 * This does not apply when we are using /run/systemd/empty as fallback. */
4054
4055 if (context->private_tmp && runtime && runtime->shared) {
4056 if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
4057 tmp_dir = runtime->shared->tmp_dir;
4058 else if (runtime->shared->tmp_dir)
4059 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
4060
4061 if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
4062 var_tmp_dir = runtime->shared->var_tmp_dir;
4063 else if (runtime->shared->var_tmp_dir)
4064 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
4065 }
4066
4067 ns_info = (NamespaceInfo) {
4068 .ignore_protect_paths = false,
4069 .private_dev = context->private_devices,
4070 .protect_control_groups = context->protect_control_groups,
4071 .protect_kernel_tunables = context->protect_kernel_tunables,
4072 .protect_kernel_modules = context->protect_kernel_modules,
4073 .protect_kernel_logs = context->protect_kernel_logs,
4074 .protect_hostname = context->protect_hostname,
4075 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
4076 .protect_home = context->protect_home,
4077 .protect_system = context->protect_system,
4078 .protect_proc = context->protect_proc,
4079 .proc_subset = context->proc_subset,
4080 .private_network = exec_needs_network_namespace(context),
4081 .private_ipc = exec_needs_ipc_namespace(context),
4082 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
4083 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
4084 };
4085 } else if (!context->dynamic_user && root_dir)
4086 /*
4087 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
4088 * sandbox info, otherwise enforce it, don't ignore protected paths and
4089 * fail if we are enable to apply the sandbox inside the mount namespace.
4090 */
4091 ns_info = (NamespaceInfo) {
4092 .ignore_protect_paths = true,
4093 };
4094 else
4095 ns_info = (NamespaceInfo) {};
4096
4097 if (context->mount_propagation_flag == MS_SHARED)
4098 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
4099
4100 if (exec_context_has_credentials(context) &&
4101 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
4102 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4103 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
4104 if (!creds_path)
4105 return -ENOMEM;
4106 }
4107
4108 if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
4109 propagate_dir = path_join("/run/systemd/propagate/", u->id);
4110 if (!propagate_dir)
4111 return -ENOMEM;
4112
4113 incoming_dir = strdup("/run/systemd/incoming");
4114 if (!incoming_dir)
4115 return -ENOMEM;
4116
4117 extension_dir = strdup("/run/systemd/unit-extensions");
4118 if (!extension_dir)
4119 return -ENOMEM;
4120 } else {
4121 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
4122
4123 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
4124 return -ENOMEM;
4125 }
4126
4127 if (root_image) {
4128 r = verity_settings_prepare(
4129 &verity,
4130 root_image,
4131 context->root_hash, context->root_hash_size, context->root_hash_path,
4132 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
4133 context->root_verity);
4134 if (r < 0)
4135 return r;
4136 }
4137
4138 r = setup_namespace(
4139 root_dir,
4140 root_image,
4141 context->root_image_options,
4142 context->root_image_policy ?: &image_policy_service,
4143 &ns_info,
4144 read_write_paths,
4145 needs_sandboxing ? context->read_only_paths : NULL,
4146 needs_sandboxing ? context->inaccessible_paths : NULL,
4147 needs_sandboxing ? context->exec_paths : NULL,
4148 needs_sandboxing ? context->no_exec_paths : NULL,
4149 empty_directories,
4150 symlinks,
4151 bind_mounts,
4152 n_bind_mounts,
4153 context->temporary_filesystems,
4154 context->n_temporary_filesystems,
4155 context->mount_images,
4156 context->n_mount_images,
4157 context->mount_image_policy ?: &image_policy_service,
4158 tmp_dir,
4159 var_tmp_dir,
4160 creds_path,
4161 context->log_namespace,
4162 context->mount_propagation_flag,
4163 &verity,
4164 context->extension_images,
4165 context->n_extension_images,
4166 context->extension_image_policy ?: &image_policy_sysext,
4167 context->extension_directories,
4168 propagate_dir,
4169 incoming_dir,
4170 extension_dir,
4171 root_dir || root_image ? params->notify_socket : NULL,
4172 error_path);
4173
4174 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
4175 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
4176 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
4177 * completely different execution environment. */
4178 if (r == -ENOANO) {
4179 if (insist_on_sandboxing(
4180 context,
4181 root_dir, root_image,
4182 bind_mounts,
4183 n_bind_mounts))
4184 return log_unit_debug_errno(u,
4185 SYNTHETIC_ERRNO(EOPNOTSUPP),
4186 "Failed to set up namespace, and refusing to continue since "
4187 "the selected namespacing options alter mount environment non-trivially.\n"
4188 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
4189 n_bind_mounts,
4190 context->n_temporary_filesystems,
4191 yes_no(root_dir),
4192 yes_no(root_image),
4193 yes_no(context->dynamic_user));
4194
4195 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4196 return 0;
4197 }
4198
4199 return r;
4200 }
4201
4202 static int apply_working_directory(
4203 const ExecContext *context,
4204 const ExecParameters *params,
4205 ExecRuntime *runtime,
4206 const char *home,
4207 int *exit_status) {
4208
4209 const char *d, *wd;
4210
4211 assert(context);
4212 assert(exit_status);
4213
4214 if (context->working_directory_home) {
4215
4216 if (!home) {
4217 *exit_status = EXIT_CHDIR;
4218 return -ENXIO;
4219 }
4220
4221 wd = home;
4222
4223 } else
4224 wd = empty_to_root(context->working_directory);
4225
4226 if (params->flags & EXEC_APPLY_CHROOT)
4227 d = wd;
4228 else
4229 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
4230
4231 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
4232 *exit_status = EXIT_CHDIR;
4233 return -errno;
4234 }
4235
4236 return 0;
4237 }
4238
4239 static int apply_root_directory(
4240 const ExecContext *context,
4241 const ExecParameters *params,
4242 ExecRuntime *runtime,
4243 const bool needs_mount_ns,
4244 int *exit_status) {
4245
4246 assert(context);
4247 assert(exit_status);
4248
4249 if (params->flags & EXEC_APPLY_CHROOT)
4250 if (!needs_mount_ns && context->root_directory)
4251 if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
4252 *exit_status = EXIT_CHROOT;
4253 return -errno;
4254 }
4255
4256 return 0;
4257 }
4258
4259 static int setup_keyring(
4260 const Unit *u,
4261 const ExecContext *context,
4262 const ExecParameters *p,
4263 uid_t uid, gid_t gid) {
4264
4265 key_serial_t keyring;
4266 int r = 0;
4267 uid_t saved_uid;
4268 gid_t saved_gid;
4269
4270 assert(u);
4271 assert(context);
4272 assert(p);
4273
4274 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
4275 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
4276 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
4277 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
4278 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
4279 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
4280
4281 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
4282 return 0;
4283
4284 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
4285 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
4286 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
4287 * & group is just as nasty as acquiring a reference to the user keyring. */
4288
4289 saved_uid = getuid();
4290 saved_gid = getgid();
4291
4292 if (gid_is_valid(gid) && gid != saved_gid) {
4293 if (setregid(gid, -1) < 0)
4294 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
4295 }
4296
4297 if (uid_is_valid(uid) && uid != saved_uid) {
4298 if (setreuid(uid, -1) < 0) {
4299 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
4300 goto out;
4301 }
4302 }
4303
4304 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
4305 if (keyring == -1) {
4306 if (errno == ENOSYS)
4307 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
4308 else if (ERRNO_IS_PRIVILEGE(errno))
4309 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
4310 else if (errno == EDQUOT)
4311 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
4312 else
4313 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
4314
4315 goto out;
4316 }
4317
4318 /* When requested link the user keyring into the session keyring. */
4319 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
4320
4321 if (keyctl(KEYCTL_LINK,
4322 KEY_SPEC_USER_KEYRING,
4323 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
4324 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
4325 goto out;
4326 }
4327 }
4328
4329 /* Restore uid/gid back */
4330 if (uid_is_valid(uid) && uid != saved_uid) {
4331 if (setreuid(saved_uid, -1) < 0) {
4332 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
4333 goto out;
4334 }
4335 }
4336
4337 if (gid_is_valid(gid) && gid != saved_gid) {
4338 if (setregid(saved_gid, -1) < 0)
4339 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
4340 }
4341
4342 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
4343 if (!sd_id128_is_null(u->invocation_id)) {
4344 key_serial_t key;
4345
4346 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
4347 if (key == -1)
4348 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
4349 else {
4350 if (keyctl(KEYCTL_SETPERM, key,
4351 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
4352 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
4353 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
4354 }
4355 }
4356
4357 out:
4358 /* Revert back uid & gid for the last time, and exit */
4359 /* no extra logging, as only the first already reported error matters */
4360 if (getuid() != saved_uid)
4361 (void) setreuid(saved_uid, -1);
4362
4363 if (getgid() != saved_gid)
4364 (void) setregid(saved_gid, -1);
4365
4366 return r;
4367 }
4368
/* Appends each valid (i.e. non-negative) fd of the given pair to the array, advancing *n for every
 * entry written. The caller must make sure the array has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                (*n)++;
        }
}
4379
4380 static int close_remaining_fds(
4381 const ExecParameters *params,
4382 const ExecRuntime *runtime,
4383 int user_lookup_fd,
4384 int socket_fd,
4385 const int *fds, size_t n_fds) {
4386
4387 size_t n_dont_close = 0;
4388 int dont_close[n_fds + 14];
4389
4390 assert(params);
4391
4392 if (params->stdin_fd >= 0)
4393 dont_close[n_dont_close++] = params->stdin_fd;
4394 if (params->stdout_fd >= 0)
4395 dont_close[n_dont_close++] = params->stdout_fd;
4396 if (params->stderr_fd >= 0)
4397 dont_close[n_dont_close++] = params->stderr_fd;
4398
4399 if (socket_fd >= 0)
4400 dont_close[n_dont_close++] = socket_fd;
4401 if (n_fds > 0) {
4402 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
4403 n_dont_close += n_fds;
4404 }
4405
4406 if (runtime)
4407 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
4408
4409 if (runtime && runtime->shared) {
4410 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
4411 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
4412 }
4413
4414 if (runtime && runtime->dynamic_creds) {
4415 if (runtime->dynamic_creds->user)
4416 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
4417 if (runtime->dynamic_creds->group)
4418 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
4419 }
4420
4421 if (user_lookup_fd >= 0)
4422 dont_close[n_dont_close++] = user_lookup_fd;
4423
4424 return close_all_fds(dont_close, n_dont_close);
4425 }
4426
4427 static int send_user_lookup(
4428 Unit *unit,
4429 int user_lookup_fd,
4430 uid_t uid,
4431 gid_t gid) {
4432
4433 assert(unit);
4434
4435 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
4436 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
4437 * specified. */
4438
4439 if (user_lookup_fd < 0)
4440 return 0;
4441
4442 if (!uid_is_valid(uid) && !gid_is_valid(gid))
4443 return 0;
4444
4445 if (writev(user_lookup_fd,
4446 (struct iovec[]) {
4447 IOVEC_MAKE(&uid, sizeof(uid)),
4448 IOVEC_MAKE(&gid, sizeof(gid)),
4449 IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
4450 return -errno;
4451
4452 return 0;
4453 }
4454
4455 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
4456 int r;
4457
4458 assert(c);
4459 assert(home);
4460 assert(buf);
4461
4462 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
4463
4464 if (*home)
4465 return 0;
4466
4467 if (!c->working_directory_home)
4468 return 0;
4469
4470 r = get_home_dir(buf);
4471 if (r < 0)
4472 return r;
4473
4474 *home = *buf;
4475 return 1;
4476 }
4477
4478 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
4479 _cleanup_strv_free_ char ** list = NULL;
4480 int r;
4481
4482 assert(c);
4483 assert(p);
4484 assert(ret);
4485
4486 assert(c->dynamic_user);
4487
4488 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
4489 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
4490 * directories. */
4491
4492 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4493 if (t == EXEC_DIRECTORY_CONFIGURATION)
4494 continue;
4495
4496 if (!p->prefix[t])
4497 continue;
4498
4499 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4500 char *e;
4501
4502 if (exec_directory_is_private(c, t))
4503 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
4504 else
4505 e = path_join(p->prefix[t], c->directories[t].items[i].path);
4506 if (!e)
4507 return -ENOMEM;
4508
4509 r = strv_consume(&list, e);
4510 if (r < 0)
4511 return r;
4512 }
4513 }
4514
4515 *ret = TAKE_PTR(list);
4516
4517 return 0;
4518 }
4519
4520 static int exec_parameters_get_cgroup_path(
4521 const ExecParameters *params,
4522 const CGroupContext *c,
4523 char **ret) {
4524
4525 const char *subgroup = NULL;
4526 char *p;
4527
4528 assert(params);
4529 assert(ret);
4530
4531 if (!params->cgroup_path)
4532 return -EINVAL;
4533
4534 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4535 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4536 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4537 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4538 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4539 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4540 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4541 * flag, which is only passed for the former statements, not for the latter. */
4542
4543 if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
4544 if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
4545 subgroup = ".control";
4546 else
4547 subgroup = c->delegate_subgroup;
4548 }
4549
4550 if (subgroup)
4551 p = path_join(params->cgroup_path, subgroup);
4552 else
4553 p = strdup(params->cgroup_path);
4554 if (!p)
4555 return -ENOMEM;
4556
4557 *ret = p;
4558 return !!subgroup;
4559 }
4560
4561 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4562 _cleanup_(cpu_set_reset) CPUSet s = {};
4563 int r;
4564
4565 assert(c);
4566 assert(ret);
4567
4568 if (!c->numa_policy.nodes.set) {
4569 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4570 return 0;
4571 }
4572
4573 r = numa_to_cpu_set(&c->numa_policy, &s);
4574 if (r < 0)
4575 return r;
4576
4577 cpu_set_reset(ret);
4578
4579 return cpu_set_add_all(ret, &s);
4580 }
4581
4582 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4583 assert(c);
4584
4585 return c->cpu_affinity_from_numa;
4586 }
4587
4588 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4589 int r;
4590
4591 assert(fds);
4592 assert(n_fds);
4593 assert(*n_fds < fds_size);
4594 assert(ret_fd);
4595
4596 if (fd < 0) {
4597 *ret_fd = -EBADF;
4598 return 0;
4599 }
4600
4601 if (fd < 3 + (int) *n_fds) {
4602 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4603 * the fds we pass to the process (or which are closed only during execve). */
4604
4605 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4606 if (r < 0)
4607 return -errno;
4608
4609 close_and_replace(fd, r);
4610 }
4611
4612 *ret_fd = fds[*n_fds] = fd;
4613 (*n_fds) ++;
4614 return 1;
4615 }
4616
4617 static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
4618 union sockaddr_union addr = {
4619 .un.sun_family = AF_UNIX,
4620 };
4621 socklen_t sa_len;
4622 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
4623 int r;
4624
4625 assert(u);
4626 assert(of);
4627 assert(ofd >= 0);
4628
4629 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
4630 if (r < 0)
4631 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
4632
4633 sa_len = r;
4634
4635 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
4636 _cleanup_close_ int fd = -EBADF;
4637
4638 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
4639 if (fd < 0)
4640 return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
4641
4642 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
4643 if (r == -EPROTOTYPE)
4644 continue;
4645 if (r < 0)
4646 return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
4647
4648 return TAKE_FD(fd);
4649 }
4650
4651 return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
4652 }
4653
4654 static int get_open_file_fd(Unit *u, const OpenFile *of) {
4655 struct stat st;
4656 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
4657
4658 assert(u);
4659 assert(of);
4660
4661 ofd = open(of->path, O_PATH | O_CLOEXEC);
4662 if (ofd < 0)
4663 return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
4664
4665 if (fstat(ofd, &st) < 0)
4666 return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
4667
4668 if (S_ISSOCK(st.st_mode)) {
4669 fd = connect_unix_harder(u, of, ofd);
4670 if (fd < 0)
4671 return fd;
4672
4673 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
4674 return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
4675 of->path);
4676
4677 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
4678 } else {
4679 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
4680 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
4681 flags |= O_APPEND;
4682 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
4683 flags |= O_TRUNC;
4684
4685 fd = fd_reopen(ofd, flags | O_CLOEXEC);
4686 if (fd < 0)
4687 return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
4688
4689 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
4690 }
4691
4692 return TAKE_FD(fd);
4693 }
4694
4695 static int collect_open_file_fds(
4696 Unit *u,
4697 OpenFile* open_files,
4698 int **fds,
4699 char ***fdnames,
4700 size_t *n_fds) {
4701 int r;
4702
4703 assert(u);
4704 assert(fds);
4705 assert(fdnames);
4706 assert(n_fds);
4707
4708 LIST_FOREACH(open_files, of, open_files) {
4709 _cleanup_close_ int fd = -EBADF;
4710
4711 fd = get_open_file_fd(u, of);
4712 if (fd < 0) {
4713 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
4714 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
4715 continue;
4716 }
4717
4718 return fd;
4719 }
4720
4721 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
4722 return -ENOMEM;
4723
4724 r = strv_extend(fdnames, of->fdname);
4725 if (r < 0)
4726 return r;
4727
4728 (*fds)[*n_fds] = TAKE_FD(fd);
4729
4730 (*n_fds)++;
4731 }
4732
4733 return 0;
4734 }
4735
4736 static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
4737 assert(unit);
4738 assert(msg);
4739 assert(executable);
4740
4741 if (!DEBUG_LOGGING)
4742 return;
4743
4744 _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
4745
4746 log_unit_struct(unit, LOG_DEBUG,
4747 "EXECUTABLE=%s", executable,
4748 LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)),
4749 LOG_UNIT_INVOCATION_ID(unit));
4750 }
4751
4752 static bool exec_context_need_unprivileged_private_users(
4753 const ExecContext *context,
4754 const ExecParameters *params) {
4755
4756 assert(context);
4757 assert(params);
4758
4759 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
4760 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
4761 * (system manager) then we have privileges and don't need this. */
4762 if (params->runtime_scope != RUNTIME_SCOPE_USER)
4763 return false;
4764
4765 return context->private_users ||
4766 context->private_tmp ||
4767 context->private_devices ||
4768 context->private_network ||
4769 context->network_namespace_path ||
4770 context->private_ipc ||
4771 context->ipc_namespace_path ||
4772 context->private_mounts > 0 ||
4773 context->mount_apivfs ||
4774 context->n_bind_mounts > 0 ||
4775 context->n_temporary_filesystems > 0 ||
4776 context->root_directory ||
4777 !strv_isempty(context->extension_directories) ||
4778 context->protect_system != PROTECT_SYSTEM_NO ||
4779 context->protect_home != PROTECT_HOME_NO ||
4780 context->protect_kernel_tunables ||
4781 context->protect_kernel_modules ||
4782 context->protect_kernel_logs ||
4783 context->protect_control_groups ||
4784 context->protect_clock ||
4785 context->protect_hostname ||
4786 !strv_isempty(context->read_write_paths) ||
4787 !strv_isempty(context->read_only_paths) ||
4788 !strv_isempty(context->inaccessible_paths) ||
4789 !strv_isempty(context->exec_paths) ||
4790 !strv_isempty(context->no_exec_paths);
4791 }
4792
4793 static int exec_child(
4794 Unit *unit,
4795 const ExecCommand *command,
4796 const ExecContext *context,
4797 const ExecParameters *params,
4798 ExecRuntime *runtime,
4799 const CGroupContext *cgroup_context,
4800 int socket_fd,
4801 const int named_iofds[static 3],
4802 int *params_fds,
4803 size_t n_socket_fds,
4804 size_t n_storage_fds,
4805 char **files_env,
4806 int user_lookup_fd,
4807 int *exit_status) {
4808
4809 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4810 int r, ngids = 0, exec_fd;
4811 _cleanup_free_ gid_t *supplementary_gids = NULL;
4812 const char *username = NULL, *groupname = NULL;
4813 _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
4814 const char *home = NULL, *shell = NULL;
4815 char **final_argv = NULL;
4816 dev_t journal_stream_dev = 0;
4817 ino_t journal_stream_ino = 0;
4818 bool userns_set_up = false;
4819 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4820 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
4821 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
4822 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
4823 #if HAVE_SELINUX
4824 _cleanup_free_ char *mac_selinux_context_net = NULL;
4825 bool use_selinux = false;
4826 #endif
4827 #if ENABLE_SMACK
4828 bool use_smack = false;
4829 #endif
4830 #if HAVE_APPARMOR
4831 bool use_apparmor = false;
4832 #endif
4833 uid_t saved_uid = getuid();
4834 gid_t saved_gid = getgid();
4835 uid_t uid = UID_INVALID;
4836 gid_t gid = GID_INVALID;
4837 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4838 n_keep_fds; /* total number of fds not to close */
4839 int secure_bits;
4840 _cleanup_free_ gid_t *gids_after_pam = NULL;
4841 int ngids_after_pam = 0;
4842 _cleanup_free_ int *fds = NULL;
4843 _cleanup_strv_free_ char **fdnames = NULL;
4844
4845 assert(unit);
4846 assert(command);
4847 assert(context);
4848 assert(params);
4849 assert(exit_status);
4850
4851 /* Explicitly test for CVE-2021-4034 inspired invocations */
4852 assert(command->path);
4853 assert(!strv_isempty(command->argv));
4854
4855 rename_process_from_path(command->path);
4856
4857 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4858 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4859 * both of which will be demoted to SIG_DFL. */
4860 (void) default_signals(SIGNALS_CRASH_HANDLER,
4861 SIGNALS_IGNORE);
4862
4863 if (context->ignore_sigpipe)
4864 (void) ignore_signals(SIGPIPE);
4865
4866 r = reset_signal_mask();
4867 if (r < 0) {
4868 *exit_status = EXIT_SIGNAL_MASK;
4869 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4870 }
4871
4872 if (params->idle_pipe)
4873 do_idle_pipe_dance(params->idle_pipe);
4874
4875 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4876 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4877 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4878 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4879
4880 log_forget_fds();
4881 log_set_open_when_needed(true);
4882 log_settle_target();
4883
4884 /* In case anything used libc syslog(), close this here, too */
4885 closelog();
4886
4887 fds = newdup(int, params_fds, n_fds);
4888 if (!fds) {
4889 *exit_status = EXIT_MEMORY;
4890 return log_oom();
4891 }
4892
4893 fdnames = strv_copy((char**) params->fd_names);
4894 if (!fdnames) {
4895 *exit_status = EXIT_MEMORY;
4896 return log_oom();
4897 }
4898
4899 r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4900 if (r < 0) {
4901 *exit_status = EXIT_FDS;
4902 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4903 }
4904
4905 int keep_fds[n_fds + 3];
4906 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4907 n_keep_fds = n_fds;
4908
4909 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4910 if (r < 0) {
4911 *exit_status = EXIT_FDS;
4912 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4913 }
4914
4915 #if HAVE_LIBBPF
4916 if (unit->manager->restrict_fs) {
4917 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4918 if (bpf_map_fd < 0) {
4919 *exit_status = EXIT_FDS;
4920 return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4921 }
4922
4923 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4924 if (r < 0) {
4925 *exit_status = EXIT_FDS;
4926 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4927 }
4928 }
4929 #endif
4930
4931 r = close_remaining_fds(params, runtime, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4932 if (r < 0) {
4933 *exit_status = EXIT_FDS;
4934 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4935 }
4936
4937 if (!context->same_pgrp &&
4938 setsid() < 0) {
4939 *exit_status = EXIT_SETSID;
4940 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4941 }
4942
4943 exec_context_tty_reset(context, params);
4944
4945 if (unit_shall_confirm_spawn(unit)) {
4946 _cleanup_free_ char *cmdline = NULL;
4947
4948 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4949 if (!cmdline) {
4950 *exit_status = EXIT_MEMORY;
4951 return log_oom();
4952 }
4953
4954 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4955 if (r != CONFIRM_EXECUTE) {
4956 if (r == CONFIRM_PRETEND_SUCCESS) {
4957 *exit_status = EXIT_SUCCESS;
4958 return 0;
4959 }
4960 *exit_status = EXIT_CONFIRM;
4961 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4962 "Execution cancelled by the user");
4963 }
4964 }
4965
4966 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4967 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4968 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4969 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4970 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4971 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4972 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4973 *exit_status = EXIT_MEMORY;
4974 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4975 }
4976
4977 if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4978 _cleanup_strv_free_ char **suggested_paths = NULL;
4979
4980 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4981 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4982 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4983 *exit_status = EXIT_USER;
4984 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4985 }
4986
4987 r = compile_suggested_paths(context, params, &suggested_paths);
4988 if (r < 0) {
4989 *exit_status = EXIT_MEMORY;
4990 return log_oom();
4991 }
4992
4993 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4994 if (r < 0) {
4995 *exit_status = EXIT_USER;
4996 if (r == -EILSEQ)
4997 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4998 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4999 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
5000 }
5001
5002 if (!uid_is_valid(uid)) {
5003 *exit_status = EXIT_USER;
5004 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
5005 }
5006
5007 if (!gid_is_valid(gid)) {
5008 *exit_status = EXIT_USER;
5009 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
5010 }
5011
5012 if (runtime->dynamic_creds->user)
5013 username = runtime->dynamic_creds->user->name;
5014
5015 } else {
5016 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
5017 if (r < 0) {
5018 *exit_status = EXIT_USER;
5019 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5020 }
5021
5022 r = get_fixed_group(context, &groupname, &gid);
5023 if (r < 0) {
5024 *exit_status = EXIT_GROUP;
5025 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
5026 }
5027 }
5028
5029 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
5030 r = get_supplementary_groups(context, username, groupname, gid,
5031 &supplementary_gids, &ngids);
5032 if (r < 0) {
5033 *exit_status = EXIT_GROUP;
5034 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
5035 }
5036
5037 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
5038 if (r < 0) {
5039 *exit_status = EXIT_USER;
5040 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
5041 }
5042
5043 user_lookup_fd = safe_close(user_lookup_fd);
5044
5045 r = acquire_home(context, uid, &home, &home_buffer);
5046 if (r < 0) {
5047 *exit_status = EXIT_CHDIR;
5048 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
5049 }
5050
5051 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
5052 if (socket_fd >= 0)
5053 (void) fd_nonblock(socket_fd, false);
5054
5055 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
5056 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
5057 if (params->cgroup_path) {
5058 _cleanup_free_ char *p = NULL;
5059
5060 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
5061 if (r < 0) {
5062 *exit_status = EXIT_CGROUP;
5063 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
5064 }
5065
5066 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
5067 if (r == -EUCLEAN) {
5068 *exit_status = EXIT_CGROUP;
5069 return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
5070 "because the cgroup or one of its parents or "
5071 "siblings is in the threaded mode: %m", p);
5072 }
5073 if (r < 0) {
5074 *exit_status = EXIT_CGROUP;
5075 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
5076 }
5077 }
5078
5079 if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
5080 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
5081 if (r < 0) {
5082 *exit_status = EXIT_NETWORK;
5083 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
5084 }
5085 }
5086
5087 if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
5088 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
5089 if (r < 0) {
5090 *exit_status = EXIT_NAMESPACE;
5091 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
5092 }
5093 }
5094
5095 r = setup_input(context, params, socket_fd, named_iofds);
5096 if (r < 0) {
5097 *exit_status = EXIT_STDIN;
5098 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
5099 }
5100
5101 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
5102 if (r < 0) {
5103 *exit_status = EXIT_STDOUT;
5104 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
5105 }
5106
5107 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
5108 if (r < 0) {
5109 *exit_status = EXIT_STDERR;
5110 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
5111 }
5112
5113 if (context->oom_score_adjust_set) {
5114 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
5115 * prohibit write access to this file, and we shouldn't trip up over that. */
5116 r = set_oom_score_adjust(context->oom_score_adjust);
5117 if (ERRNO_IS_PRIVILEGE(r))
5118 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
5119 else if (r < 0) {
5120 *exit_status = EXIT_OOM_ADJUST;
5121 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
5122 }
5123 }
5124
5125 if (context->coredump_filter_set) {
5126 r = set_coredump_filter(context->coredump_filter);
5127 if (ERRNO_IS_PRIVILEGE(r))
5128 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
5129 else if (r < 0)
5130 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
5131 }
5132
5133 if (context->nice_set) {
5134 r = setpriority_closest(context->nice);
5135 if (r < 0)
5136 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
5137 }
5138
5139 if (context->cpu_sched_set) {
5140 struct sched_param param = {
5141 .sched_priority = context->cpu_sched_priority,
5142 };
5143
5144 r = sched_setscheduler(0,
5145 context->cpu_sched_policy |
5146 (context->cpu_sched_reset_on_fork ?
5147 SCHED_RESET_ON_FORK : 0),
5148 &param);
5149 if (r < 0) {
5150 *exit_status = EXIT_SETSCHEDULER;
5151 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
5152 }
5153 }
5154
5155 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
5156 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
5157 const CPUSet *cpu_set;
5158
5159 if (context->cpu_affinity_from_numa) {
5160 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
5161 if (r < 0) {
5162 *exit_status = EXIT_CPUAFFINITY;
5163 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
5164 }
5165
5166 cpu_set = &converted_cpu_set;
5167 } else
5168 cpu_set = &context->cpu_set;
5169
5170 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
5171 *exit_status = EXIT_CPUAFFINITY;
5172 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
5173 }
5174 }
5175
5176 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
5177 r = apply_numa_policy(&context->numa_policy);
5178 if (r < 0) {
5179 if (ERRNO_IS_NOT_SUPPORTED(r))
5180 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
5181 else {
5182 *exit_status = EXIT_NUMA_POLICY;
5183 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
5184 }
5185 }
5186 }
5187
5188 if (context->ioprio_set)
5189 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
5190 *exit_status = EXIT_IOPRIO;
5191 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
5192 }
5193
5194 if (context->timer_slack_nsec != NSEC_INFINITY)
5195 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
5196 *exit_status = EXIT_TIMERSLACK;
5197 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
5198 }
5199
5200 if (context->personality != PERSONALITY_INVALID) {
5201 r = safe_personality(context->personality);
5202 if (r < 0) {
5203 *exit_status = EXIT_PERSONALITY;
5204 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
5205 }
5206 }
5207
5208 if (context->utmp_id) {
5209 const char *line = context->tty_path ?
5210 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
5211 NULL;
5212 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
5213 line,
5214 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
5215 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
5216 USER_PROCESS,
5217 username);
5218 }
5219
5220 if (uid_is_valid(uid)) {
5221 r = chown_terminal(STDIN_FILENO, uid);
5222 if (r < 0) {
5223 *exit_status = EXIT_STDIN;
5224 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
5225 }
5226 }
5227
5228 if (params->cgroup_path) {
5229 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
5230 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
5231 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
5232 * touch a single hierarchy too. */
5233
5234 if (params->flags & EXEC_CGROUP_DELEGATE) {
5235 _cleanup_free_ char *p = NULL;
5236
5237 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
5238 if (r < 0) {
5239 *exit_status = EXIT_CGROUP;
5240 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
5241 }
5242
5243 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
5244 if (r < 0) {
5245 *exit_status = EXIT_CGROUP;
5246 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
5247 }
5248 if (r > 0) {
5249 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
5250 if (r < 0) {
5251 *exit_status = EXIT_CGROUP;
5252 return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
5253 }
5254 }
5255 }
5256
5257 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
5258 if (cgroup_context_want_memory_pressure(cgroup_context)) {
5259 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
5260 if (r < 0) {
5261 *exit_status = EXIT_MEMORY;
5262 return log_oom();
5263 }
5264
5265 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
5266 if (r < 0) {
5267 log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
5268 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
5269 memory_pressure_path = mfree(memory_pressure_path);
5270 }
5271 } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
5272 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning off memory pressure watch */
5273 if (!memory_pressure_path) {
5274 *exit_status = EXIT_MEMORY;
5275 return log_oom();
5276 }
5277 }
5278 }
5279 }
5280
5281 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
5282
5283 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5284 r = setup_exec_directory(unit, context, params, uid, gid, dt, needs_mount_namespace, exit_status);
5285 if (r < 0)
5286 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
5287 }
5288
5289 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
5290 r = setup_credentials(context, params, unit->id, uid);
5291 if (r < 0) {
5292 *exit_status = EXIT_CREDENTIALS;
5293 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
5294 }
5295 }
5296
5297 r = build_environment(
5298 unit,
5299 context,
5300 params,
5301 cgroup_context,
5302 n_fds,
5303 fdnames,
5304 home,
5305 username,
5306 shell,
5307 journal_stream_dev,
5308 journal_stream_ino,
5309 memory_pressure_path,
5310 &our_env);
5311 if (r < 0) {
5312 *exit_status = EXIT_MEMORY;
5313 return log_oom();
5314 }
5315
5316 r = build_pass_environment(context, &pass_env);
5317 if (r < 0) {
5318 *exit_status = EXIT_MEMORY;
5319 return log_oom();
5320 }
5321
5322 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
5323 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
5324 * not specify PATH but the unit has ExecSearchPath. */
5325 if (!strv_isempty(context->exec_search_path)) {
5326 _cleanup_free_ char *joined = NULL;
5327
5328 joined = strv_join(context->exec_search_path, ":");
5329 if (!joined) {
5330 *exit_status = EXIT_MEMORY;
5331 return log_oom();
5332 }
5333
5334 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
5335 if (r < 0) {
5336 *exit_status = EXIT_MEMORY;
5337 return log_oom();
5338 }
5339 }
5340
5341 accum_env = strv_env_merge(params->environment,
5342 our_env,
5343 joined_exec_search_path,
5344 pass_env,
5345 context->environment,
5346 files_env);
5347 if (!accum_env) {
5348 *exit_status = EXIT_MEMORY;
5349 return log_oom();
5350 }
5351 accum_env = strv_env_clean(accum_env);
5352
5353 (void) umask(context->umask);
5354
5355 r = setup_keyring(unit, context, params, uid, gid);
5356 if (r < 0) {
5357 *exit_status = EXIT_KEYRING;
5358 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
5359 }
5360
5361 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
5362 * from it. */
5363 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
5364
5365 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
5366 * for it, and the kernel doesn't actually support ambient caps. */
5367 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
5368
5369 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
5370 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
5371 * desired. */
5372 if (needs_ambient_hack)
5373 needs_setuid = false;
5374 else
5375 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
5376
5377 uint64_t capability_ambient_set = context->capability_ambient_set;
5378
5379 if (needs_sandboxing) {
5380 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
5381 * /sys being present. The actual MAC context application will happen later, as late as
5382 * possible, to avoid impacting our own code paths. */
5383
5384 #if HAVE_SELINUX
5385 use_selinux = mac_selinux_use();
5386 #endif
5387 #if ENABLE_SMACK
5388 use_smack = mac_smack_use();
5389 #endif
5390 #if HAVE_APPARMOR
5391 use_apparmor = mac_apparmor_use();
5392 #endif
5393 }
5394
5395 if (needs_sandboxing) {
5396 int which_failed;
5397
5398 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
5399 * is set here. (See below.) */
5400
5401 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
5402 if (r < 0) {
5403 *exit_status = EXIT_LIMITS;
5404 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
5405 }
5406 }
5407
5408 if (needs_setuid && context->pam_name && username) {
5409 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
5410 * wins here. (See above.) */
5411
5412 /* All fds passed in the fds array will be closed in the pam child process. */
5413 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
5414 if (r < 0) {
5415 *exit_status = EXIT_PAM;
5416 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
5417 }
5418
5419 if (ambient_capabilities_supported()) {
5420 uint64_t ambient_after_pam;
5421
5422 /* PAM modules might have set some ambient caps. Query them here and merge them into
5423 * the caps we want to set in the end, so that we don't end up unsetting them. */
5424 r = capability_get_ambient(&ambient_after_pam);
5425 if (r < 0) {
5426 *exit_status = EXIT_CAPABILITIES;
5427 return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
5428 }
5429
5430 capability_ambient_set |= ambient_after_pam;
5431 }
5432
5433 ngids_after_pam = getgroups_alloc(&gids_after_pam);
5434 if (ngids_after_pam < 0) {
5435 *exit_status = EXIT_MEMORY;
5436 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5437 }
5438 }
5439
5440 if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
5441 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
5442 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
5443 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
5444
5445 r = setup_private_users(saved_uid, saved_gid, uid, gid);
5446 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
5447 * the actual requested operations fail (or silently continue). */
5448 if (r < 0 && context->private_users) {
5449 *exit_status = EXIT_USER;
5450 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5451 }
5452 if (r < 0)
5453 log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
5454 else
5455 userns_set_up = true;
5456 }
5457
5458 if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
5459
5460 /* Try to enable network namespacing if network namespacing is available and we have
5461 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
5462 * new network namespace. And if we don't have that, then we could only create a network
5463 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
5464 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
5465 r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
5466 if (r < 0) {
5467 if (ERRNO_IS_PRIVILEGE(r))
5468 log_unit_notice_errno(unit, r,
5469 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
5470 else {
5471 *exit_status = EXIT_NETWORK;
5472 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
5473 }
5474 }
5475 } else if (context->network_namespace_path) {
5476 *exit_status = EXIT_NETWORK;
5477 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
5478 "NetworkNamespacePath= is not supported, refusing.");
5479 } else
5480 log_unit_notice(unit, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
5481 }
5482
5483 if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
5484
5485 if (ns_type_supported(NAMESPACE_IPC)) {
5486 r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
5487 if (r == -EPERM)
5488 log_unit_warning_errno(unit, r,
5489 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
5490 else if (r < 0) {
5491 *exit_status = EXIT_NAMESPACE;
5492 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
5493 }
5494 } else if (context->ipc_namespace_path) {
5495 *exit_status = EXIT_NAMESPACE;
5496 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
5497 "IPCNamespacePath= is not supported, refusing.");
5498 } else
5499 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
5500 }
5501
5502 if (needs_mount_namespace) {
5503 _cleanup_free_ char *error_path = NULL;
5504
5505 r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
5506 if (r < 0) {
5507 *exit_status = EXIT_NAMESPACE;
5508 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
5509 error_path ? ": " : "", strempty(error_path));
5510 }
5511 }
5512
5513 if (needs_sandboxing) {
5514 r = apply_protect_hostname(unit, context, exit_status);
5515 if (r < 0)
5516 return r;
5517 }
5518
5519 if (context->memory_ksm >= 0)
5520 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
5521 if (ERRNO_IS_NOT_SUPPORTED(errno))
5522 log_unit_debug_errno(unit, errno, "KSM support not available, ignoring.");
5523 else {
5524 *exit_status = EXIT_KSM;
5525 return log_unit_error_errno(unit, errno, "Failed to set KSM: %m");
5526 }
5527 }
5528
5529 /* Drop groups as early as possible.
5530 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
5531 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
5532 if (needs_setuid) {
5533 _cleanup_free_ gid_t *gids_to_enforce = NULL;
5534 int ngids_to_enforce = 0;
5535
5536 ngids_to_enforce = merge_gid_lists(supplementary_gids,
5537 ngids,
5538 gids_after_pam,
5539 ngids_after_pam,
5540 &gids_to_enforce);
5541 if (ngids_to_enforce < 0) {
5542 *exit_status = EXIT_MEMORY;
5543 return log_unit_error_errno(unit,
5544 ngids_to_enforce,
5545 "Failed to merge group lists. Group membership might be incorrect: %m");
5546 }
5547
5548 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
5549 if (r < 0) {
5550 *exit_status = EXIT_GROUP;
5551 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
5552 }
5553 }
5554
5555 /* If the user namespace was not set up above, try to do it now.
5556 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
5557 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5558 * case of mount namespaces being less privileged when the mount point list is copied from a
5559 * different user namespace). */
5560
5561 if (needs_sandboxing && context->private_users && !userns_set_up) {
5562 r = setup_private_users(saved_uid, saved_gid, uid, gid);
5563 if (r < 0) {
5564 *exit_status = EXIT_USER;
5565 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
5566 }
5567 }
5568
5569 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
5570 * shall execute. */
5571
5572 _cleanup_free_ char *executable = NULL;
5573 _cleanup_close_ int executable_fd = -EBADF;
5574 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
5575 if (r < 0) {
5576 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
5577 log_unit_struct_errno(unit, LOG_INFO, r,
5578 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5579 LOG_UNIT_INVOCATION_ID(unit),
5580 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
5581 command->path),
5582 "EXECUTABLE=%s", command->path);
5583 return 0;
5584 }
5585
5586 *exit_status = EXIT_EXEC;
5587
5588 return log_unit_struct_errno(unit, LOG_INFO, r,
5589 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5590 LOG_UNIT_INVOCATION_ID(unit),
5591 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
5592 command->path),
5593 "EXECUTABLE=%s", command->path);
5594 }
5595
5596 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
5597 if (r < 0) {
5598 *exit_status = EXIT_FDS;
5599 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
5600 }
5601
5602 #if HAVE_SELINUX
5603 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
5604 int fd = -EBADF;
5605
5606 if (socket_fd >= 0)
5607 fd = socket_fd;
5608 else if (params->n_socket_fds == 1)
5609 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
5610 * use context from that fd to compute the label. */
5611 fd = params->fds[0];
5612
5613 if (fd >= 0) {
5614 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
5615 if (r < 0) {
5616 if (!context->selinux_context_ignore) {
5617 *exit_status = EXIT_SELINUX_CONTEXT;
5618 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
5619 }
5620 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
5621 }
5622 }
5623 }
5624 #endif
5625
5626 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
5627 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
5628 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
5629 * execve(). */
5630
5631 r = close_all_fds(keep_fds, n_keep_fds);
5632 if (r >= 0)
5633 r = shift_fds(fds, n_fds);
5634 if (r >= 0)
5635 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
5636 if (r < 0) {
5637 *exit_status = EXIT_FDS;
5638 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
5639 }
5640
5641 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
5642 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
5643 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
5644 * came this far. */
5645
5646 secure_bits = context->secure_bits;
5647
5648 if (needs_sandboxing) {
5649 uint64_t bset;
5650
5651 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
5652 * (Note this is placed after the general resource limit initialization, see above, in order
5653 * to take precedence.) */
5654 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
5655 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
5656 *exit_status = EXIT_LIMITS;
5657 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
5658 }
5659 }
5660
5661 #if ENABLE_SMACK
5662 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5663 * process. This is the latest place before dropping capabilities. Other MAC contexts are set later. */
5664 if (use_smack) {
5665 r = setup_smack(unit->manager, context, executable_fd);
5666 if (r < 0 && !context->smack_process_label_ignore) {
5667 *exit_status = EXIT_SMACK_PROCESS_LABEL;
5668 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
5669 }
5670 }
5671 #endif
5672
5673 bset = context->capability_bounding_set;
5674 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
5675 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
5676 * instead of us doing that */
5677 if (needs_ambient_hack)
5678 bset |= (UINT64_C(1) << CAP_SETPCAP) |
5679 (UINT64_C(1) << CAP_SETUID) |
5680 (UINT64_C(1) << CAP_SETGID);
5681
5682 if (!cap_test_all(bset)) {
5683 r = capability_bounding_set_drop(bset, /* right_now= */ false);
5684 if (r < 0) {
5685 *exit_status = EXIT_CAPABILITIES;
5686 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
5687 }
5688 }
5689
5690 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5691 * keep-caps set.
5692 *
5693 * To be able to raise the ambient capabilities after setresuid() they have to be added to
5694 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
5695 * the ambient capabilities can be raised as they are present in the permitted and
5696 * inheritable set. However it is possible that someone wants to set ambient capabilities
5697 * without changing the user, so we also set the ambient capabilities here.
5698 *
5699 * The requested ambient capabilities are raised in the inheritable set if the second
5700 * argument is true. */
5701 if (!needs_ambient_hack) {
5702 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
5703 if (r < 0) {
5704 *exit_status = EXIT_CAPABILITIES;
5705 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
5706 }
5707 }
5708 }
5709
5710 /* chroot to root directory first, before we lose the ability to chroot */
5711 r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
5712 if (r < 0)
5713 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
5714
5715 if (needs_setuid) {
5716 if (uid_is_valid(uid)) {
5717 r = enforce_user(context, uid, capability_ambient_set);
5718 if (r < 0) {
5719 *exit_status = EXIT_USER;
5720 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5721 }
5722
5723 if (!needs_ambient_hack && capability_ambient_set != 0) {
5724
5725 /* Raise the ambient capabilities after user change. */
5726 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
5727 if (r < 0) {
5728 *exit_status = EXIT_CAPABILITIES;
5729 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
5730 }
5731 }
5732 }
5733 }
5734
5735 /* Apply working directory here, because the working directory might be on NFS and only the user running
5736 * this service might have the correct privilege to change to the working directory */
5737 r = apply_working_directory(context, params, runtime, home, exit_status);
5738 if (r < 0)
5739 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
5740
5741 if (needs_sandboxing) {
5742 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5743 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5744 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5745 * are restricted. */
5746
5747 #if HAVE_SELINUX
5748 if (use_selinux) {
5749 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5750
5751 if (exec_context) {
5752 r = setexeccon(exec_context);
5753 if (r < 0) {
5754 if (!context->selinux_context_ignore) {
5755 *exit_status = EXIT_SELINUX_CONTEXT;
5756 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5757 }
5758 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5759 }
5760 }
5761 }
5762 #endif
5763
5764 #if HAVE_APPARMOR
5765 if (use_apparmor && context->apparmor_profile) {
5766 r = aa_change_onexec(context->apparmor_profile);
5767 if (r < 0 && !context->apparmor_profile_ignore) {
5768 *exit_status = EXIT_APPARMOR_PROFILE;
5769 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5770 }
5771 }
5772 #endif
5773
5774 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5775 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5776 * requires CAP_SETPCAP. */
5777 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5778 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5779 * effective set here.
5780 *
5781 * The effective set is overwritten during execve() with the following values:
5782 *
5783 * - ambient set (for non-root processes)
5784 *
5785 * - (inheritable | bounding) set (for root processes)
5786 *
5787 * Hence there is no security impact to raise it in the effective set before execve
5788 */
5789 r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
5790 if (r < 0) {
5791 *exit_status = EXIT_CAPABILITIES;
5792 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5793 }
5794 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5795 *exit_status = EXIT_SECUREBITS;
5796 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
5797 }
5798 }
5799
5800 if (context_has_no_new_privileges(context))
5801 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5802 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5803 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
5804 }
5805
5806 #if HAVE_SECCOMP
5807 r = apply_address_families(unit, context);
5808 if (r < 0) {
5809 *exit_status = EXIT_ADDRESS_FAMILIES;
5810 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
5811 }
5812
5813 r = apply_memory_deny_write_execute(unit, context);
5814 if (r < 0) {
5815 *exit_status = EXIT_SECCOMP;
5816 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
5817 }
5818
5819 r = apply_restrict_realtime(unit, context);
5820 if (r < 0) {
5821 *exit_status = EXIT_SECCOMP;
5822 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
5823 }
5824
5825 r = apply_restrict_suid_sgid(unit, context);
5826 if (r < 0) {
5827 *exit_status = EXIT_SECCOMP;
5828 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5829 }
5830
5831 r = apply_restrict_namespaces(unit, context);
5832 if (r < 0) {
5833 *exit_status = EXIT_SECCOMP;
5834 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5835 }
5836
5837 r = apply_protect_sysctl(unit, context);
5838 if (r < 0) {
5839 *exit_status = EXIT_SECCOMP;
5840 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5841 }
5842
5843 r = apply_protect_kernel_modules(unit, context);
5844 if (r < 0) {
5845 *exit_status = EXIT_SECCOMP;
5846 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5847 }
5848
5849 r = apply_protect_kernel_logs(unit, context);
5850 if (r < 0) {
5851 *exit_status = EXIT_SECCOMP;
5852 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5853 }
5854
5855 r = apply_protect_clock(unit, context);
5856 if (r < 0) {
5857 *exit_status = EXIT_SECCOMP;
5858 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5859 }
5860
5861 r = apply_private_devices(unit, context);
5862 if (r < 0) {
5863 *exit_status = EXIT_SECCOMP;
5864 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5865 }
5866
5867 r = apply_syscall_archs(unit, context);
5868 if (r < 0) {
5869 *exit_status = EXIT_SECCOMP;
5870 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5871 }
5872
5873 r = apply_lock_personality(unit, context);
5874 if (r < 0) {
5875 *exit_status = EXIT_SECCOMP;
5876 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5877 }
5878
5879 r = apply_syscall_log(unit, context);
5880 if (r < 0) {
5881 *exit_status = EXIT_SECCOMP;
5882 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5883 }
5884
5885 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5886 * by the filter as little as possible. */
5887 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5888 if (r < 0) {
5889 *exit_status = EXIT_SECCOMP;
5890 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5891 }
5892 #endif
5893
5894 #if HAVE_LIBBPF
5895 r = apply_restrict_filesystems(unit, context);
5896 if (r < 0) {
5897 *exit_status = EXIT_BPF;
5898 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5899 }
5900 #endif
5901
5902 }
5903
5904 if (!strv_isempty(context->unset_environment)) {
5905 char **ee = NULL;
5906
5907 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5908 if (!ee) {
5909 *exit_status = EXIT_MEMORY;
5910 return log_oom();
5911 }
5912
5913 strv_free_and_replace(accum_env, ee);
5914 }
5915
5916 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5917 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5918
5919 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5920 if (r < 0) {
5921 *exit_status = EXIT_MEMORY;
5922 return log_unit_error_errno(unit, r, "Failed to replace environment variables: %m");
5923 }
5924 final_argv = replaced_argv;
5925
5926 if (!strv_isempty(unset_variables)) {
5927 _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5928 log_unit_warning(unit, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5929 }
5930
5931 if (!strv_isempty(bad_variables)) {
5932 _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5933 log_unit_warning(unit, "Invalid environment variable name evaluates to an empty string: %s", strna(jb));;
5934 }
5935 } else
5936 final_argv = command->argv;
5937
5938 log_command_line(unit, "Executing", executable, final_argv);
5939
5940 if (exec_fd >= 0) {
5941 uint8_t hot = 1;
5942
5943 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5944 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5945
5946 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5947 *exit_status = EXIT_EXEC;
5948 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5949 }
5950 }
5951
5952 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5953
5954 if (exec_fd >= 0) {
5955 uint8_t hot = 0;
5956
5957 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5958 * that POLLHUP on it no longer means execve() succeeded. */
5959
5960 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5961 *exit_status = EXIT_EXEC;
5962 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5963 }
5964 }
5965
5966 *exit_status = EXIT_EXEC;
5967 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5968 }
5969
5970 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
5971 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5972
5973 int exec_spawn(Unit *unit,
5974 ExecCommand *command,
5975 const ExecContext *context,
5976 const ExecParameters *params,
5977 ExecRuntime *runtime,
5978 const CGroupContext *cgroup_context,
5979 pid_t *ret) {
5980
5981 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
5982 _cleanup_free_ char *subcgroup_path = NULL;
5983 _cleanup_strv_free_ char **files_env = NULL;
5984 size_t n_storage_fds = 0, n_socket_fds = 0;
5985 pid_t pid;
5986
5987 assert(unit);
5988 assert(command);
5989 assert(context);
5990 assert(ret);
5991 assert(params);
5992 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
5993
5994 LOG_CONTEXT_PUSH_UNIT(unit);
5995
5996 if (context->std_input == EXEC_INPUT_SOCKET ||
5997 context->std_output == EXEC_OUTPUT_SOCKET ||
5998 context->std_error == EXEC_OUTPUT_SOCKET) {
5999
6000 if (params->n_socket_fds > 1)
6001 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
6002
6003 if (params->n_socket_fds == 0)
6004 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
6005
6006 socket_fd = params->fds[0];
6007 } else {
6008 socket_fd = -EBADF;
6009 fds = params->fds;
6010 n_socket_fds = params->n_socket_fds;
6011 n_storage_fds = params->n_storage_fds;
6012 }
6013
6014 r = exec_context_named_iofds(context, params, named_iofds);
6015 if (r < 0)
6016 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
6017
6018 r = exec_context_load_environment(unit, context, &files_env);
6019 if (r < 0)
6020 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
6021
6022 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
6023 and, until the next SELinux policy changes, we save further reloads in future children. */
6024 mac_selinux_maybe_reload();
6025
6026 /* We won't know the real executable path until we create the mount namespace in the child, but we
6027 want to log from the parent, so we use the possibly inaccurate path here. */
6028 log_command_line(unit, "About to execute", command->path, command->argv);
6029
6030 if (params->cgroup_path) {
6031 r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
6032 if (r < 0)
6033 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
6034 if (r > 0) {
6035 /* If there's a subcgroup, then let's create it here now (the main cgroup was already
6036 * realized by the unit logic) */
6037
6038 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
6039 if (r < 0)
6040 return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
6041 }
6042 }
6043
6044 pid = fork();
6045 if (pid < 0)
6046 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
6047
6048 if (pid == 0) {
6049 int exit_status = EXIT_SUCCESS;
6050
6051 r = exec_child(unit,
6052 command,
6053 context,
6054 params,
6055 runtime,
6056 cgroup_context,
6057 socket_fd,
6058 named_iofds,
6059 fds,
6060 n_socket_fds,
6061 n_storage_fds,
6062 files_env,
6063 unit->manager->user_lookup_fds[1],
6064 &exit_status);
6065
6066 if (r < 0) {
6067 const char *status =
6068 exit_status_to_string(exit_status,
6069 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
6070
6071 log_unit_struct_errno(unit, LOG_ERR, r,
6072 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
6073 LOG_UNIT_INVOCATION_ID(unit),
6074 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
6075 status, command->path),
6076 "EXECUTABLE=%s", command->path);
6077 }
6078
6079 _exit(exit_status);
6080 }
6081
6082 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
6083
6084 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
6085 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
6086 * process will be killed too). */
6087 if (subcgroup_path)
6088 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
6089
6090 exec_status_start(&command->exec_status, pid);
6091
6092 *ret = pid;
6093 return 0;
6094 }
6095
6096 void exec_context_init(ExecContext *c) {
6097 assert(c);
6098
6099 c->umask = 0022;
6100 c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
6101 c->cpu_sched_policy = SCHED_OTHER;
6102 c->syslog_priority = LOG_DAEMON|LOG_INFO;
6103 c->syslog_level_prefix = true;
6104 c->ignore_sigpipe = true;
6105 c->timer_slack_nsec = NSEC_INFINITY;
6106 c->personality = PERSONALITY_INVALID;
6107 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6108 c->directories[t].mode = 0755;
6109 c->timeout_clean_usec = USEC_INFINITY;
6110 c->capability_bounding_set = CAP_MASK_UNSET;
6111 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
6112 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
6113 c->log_level_max = -1;
6114 #if HAVE_SECCOMP
6115 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
6116 #endif
6117 c->tty_rows = UINT_MAX;
6118 c->tty_cols = UINT_MAX;
6119 numa_policy_reset(&c->numa_policy);
6120 c->private_mounts = -1;
6121 c->memory_ksm = -1;
6122 }
6123
6124 void exec_context_done(ExecContext *c) {
6125 assert(c);
6126
6127 c->environment = strv_free(c->environment);
6128 c->environment_files = strv_free(c->environment_files);
6129 c->pass_environment = strv_free(c->pass_environment);
6130 c->unset_environment = strv_free(c->unset_environment);
6131
6132 rlimit_free_all(c->rlimit);
6133
6134 for (size_t l = 0; l < 3; l++) {
6135 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
6136 c->stdio_file[l] = mfree(c->stdio_file[l]);
6137 }
6138
6139 c->working_directory = mfree(c->working_directory);
6140 c->root_directory = mfree(c->root_directory);
6141 c->root_image = mfree(c->root_image);
6142 c->root_image_options = mount_options_free_all(c->root_image_options);
6143 c->root_hash = mfree(c->root_hash);
6144 c->root_hash_size = 0;
6145 c->root_hash_path = mfree(c->root_hash_path);
6146 c->root_hash_sig = mfree(c->root_hash_sig);
6147 c->root_hash_sig_size = 0;
6148 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
6149 c->root_verity = mfree(c->root_verity);
6150 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
6151 c->extension_directories = strv_free(c->extension_directories);
6152 c->tty_path = mfree(c->tty_path);
6153 c->syslog_identifier = mfree(c->syslog_identifier);
6154 c->user = mfree(c->user);
6155 c->group = mfree(c->group);
6156
6157 c->supplementary_groups = strv_free(c->supplementary_groups);
6158
6159 c->pam_name = mfree(c->pam_name);
6160
6161 c->read_only_paths = strv_free(c->read_only_paths);
6162 c->read_write_paths = strv_free(c->read_write_paths);
6163 c->inaccessible_paths = strv_free(c->inaccessible_paths);
6164 c->exec_paths = strv_free(c->exec_paths);
6165 c->no_exec_paths = strv_free(c->no_exec_paths);
6166 c->exec_search_path = strv_free(c->exec_search_path);
6167
6168 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
6169 c->bind_mounts = NULL;
6170 c->n_bind_mounts = 0;
6171 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
6172 c->temporary_filesystems = NULL;
6173 c->n_temporary_filesystems = 0;
6174 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
6175
6176 cpu_set_reset(&c->cpu_set);
6177 numa_policy_reset(&c->numa_policy);
6178
6179 c->utmp_id = mfree(c->utmp_id);
6180 c->selinux_context = mfree(c->selinux_context);
6181 c->apparmor_profile = mfree(c->apparmor_profile);
6182 c->smack_process_label = mfree(c->smack_process_label);
6183
6184 c->restrict_filesystems = set_free(c->restrict_filesystems);
6185
6186 c->syscall_filter = hashmap_free(c->syscall_filter);
6187 c->syscall_archs = set_free(c->syscall_archs);
6188 c->address_families = set_free(c->address_families);
6189
6190 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6191 exec_directory_done(&c->directories[t]);
6192
6193 c->log_level_max = -1;
6194
6195 exec_context_free_log_extra_fields(c);
6196 c->log_filter_allowed_patterns = set_free(c->log_filter_allowed_patterns);
6197 c->log_filter_denied_patterns = set_free(c->log_filter_denied_patterns);
6198
6199 c->log_ratelimit_interval_usec = 0;
6200 c->log_ratelimit_burst = 0;
6201
6202 c->stdin_data = mfree(c->stdin_data);
6203 c->stdin_data_size = 0;
6204
6205 c->network_namespace_path = mfree(c->network_namespace_path);
6206 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
6207
6208 c->log_namespace = mfree(c->log_namespace);
6209
6210 c->load_credentials = hashmap_free(c->load_credentials);
6211 c->set_credentials = hashmap_free(c->set_credentials);
6212 c->import_credentials = set_free(c->import_credentials);
6213
6214 c->root_image_policy = image_policy_free(c->root_image_policy);
6215 c->mount_image_policy = image_policy_free(c->mount_image_policy);
6216 c->extension_image_policy = image_policy_free(c->extension_image_policy);
6217 }
6218
6219 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
6220 assert(c);
6221
6222 if (!runtime_prefix)
6223 return 0;
6224
6225 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
6226 _cleanup_free_ char *p = NULL;
6227
6228 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
6229 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
6230 else
6231 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
6232 if (!p)
6233 return -ENOMEM;
6234
6235 /* We execute this synchronously, since we need to be sure this is gone when we start the
6236 * service next. */
6237 (void) rm_rf(p, REMOVE_ROOT);
6238
6239 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
6240 _cleanup_free_ char *symlink_abs = NULL;
6241
6242 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
6243 symlink_abs = path_join(runtime_prefix, "private", *symlink);
6244 else
6245 symlink_abs = path_join(runtime_prefix, *symlink);
6246 if (!symlink_abs)
6247 return -ENOMEM;
6248
6249 (void) unlink(symlink_abs);
6250 }
6251 }
6252
6253 return 0;
6254 }
6255
6256 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
6257 _cleanup_free_ char *p = NULL;
6258
6259 assert(c);
6260
6261 if (!runtime_prefix || !unit)
6262 return 0;
6263
6264 p = path_join(runtime_prefix, "credentials", unit);
6265 if (!p)
6266 return -ENOMEM;
6267
6268 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
6269 * unmount it, and afterwards remove the mount point */
6270 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
6271 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
6272
6273 return 0;
6274 }
6275
6276 int exec_context_destroy_mount_ns_dir(Unit *u) {
6277 _cleanup_free_ char *p = NULL;
6278
6279 if (!u || !MANAGER_IS_SYSTEM(u->manager))
6280 return 0;
6281
6282 p = path_join("/run/systemd/propagate/", u->id);
6283 if (!p)
6284 return -ENOMEM;
6285
6286 /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
6287 if (rmdir(p) < 0 && errno != ENOENT)
6288 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
6289
6290 return 0;
6291 }
6292
6293 static void exec_command_done(ExecCommand *c) {
6294 assert(c);
6295
6296 c->path = mfree(c->path);
6297 c->argv = strv_free(c->argv);
6298 }
6299
6300 void exec_command_done_array(ExecCommand *c, size_t n) {
6301 for (size_t i = 0; i < n; i++)
6302 exec_command_done(c+i);
6303 }
6304
6305 ExecCommand* exec_command_free_list(ExecCommand *c) {
6306 ExecCommand *i;
6307
6308 while ((i = c)) {
6309 LIST_REMOVE(command, c, i);
6310 exec_command_done(i);
6311 free(i);
6312 }
6313
6314 return NULL;
6315 }
6316
6317 void exec_command_free_array(ExecCommand **c, size_t n) {
6318 for (size_t i = 0; i < n; i++)
6319 c[i] = exec_command_free_list(c[i]);
6320 }
6321
6322 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
6323 for (size_t i = 0; i < n; i++)
6324 exec_status_reset(&c[i].exec_status);
6325 }
6326
6327 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
6328 for (size_t i = 0; i < n; i++)
6329 LIST_FOREACH(command, z, c[i])
6330 exec_status_reset(&z->exec_status);
6331 }
6332
6333 typedef struct InvalidEnvInfo {
6334 const Unit *unit;
6335 const char *path;
6336 } InvalidEnvInfo;
6337
6338 static void invalid_env(const char *p, void *userdata) {
6339 InvalidEnvInfo *info = userdata;
6340
6341 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
6342 }
6343
6344 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
6345 assert(c);
6346
6347 switch (fd_index) {
6348
6349 case STDIN_FILENO:
6350 if (c->std_input != EXEC_INPUT_NAMED_FD)
6351 return NULL;
6352
6353 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
6354
6355 case STDOUT_FILENO:
6356 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
6357 return NULL;
6358
6359 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
6360
6361 case STDERR_FILENO:
6362 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
6363 return NULL;
6364
6365 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
6366
6367 default:
6368 return NULL;
6369 }
6370 }
6371
6372 static int exec_context_named_iofds(
6373 const ExecContext *c,
6374 const ExecParameters *p,
6375 int named_iofds[static 3]) {
6376
6377 size_t targets;
6378 const char* stdio_fdname[3];
6379 size_t n_fds;
6380
6381 assert(c);
6382 assert(p);
6383 assert(named_iofds);
6384
6385 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
6386 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
6387 (c->std_error == EXEC_OUTPUT_NAMED_FD);
6388
6389 for (size_t i = 0; i < 3; i++)
6390 stdio_fdname[i] = exec_context_fdname(c, i);
6391
6392 n_fds = p->n_storage_fds + p->n_socket_fds;
6393
6394 for (size_t i = 0; i < n_fds && targets > 0; i++)
6395 if (named_iofds[STDIN_FILENO] < 0 &&
6396 c->std_input == EXEC_INPUT_NAMED_FD &&
6397 stdio_fdname[STDIN_FILENO] &&
6398 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
6399
6400 named_iofds[STDIN_FILENO] = p->fds[i];
6401 targets--;
6402
6403 } else if (named_iofds[STDOUT_FILENO] < 0 &&
6404 c->std_output == EXEC_OUTPUT_NAMED_FD &&
6405 stdio_fdname[STDOUT_FILENO] &&
6406 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
6407
6408 named_iofds[STDOUT_FILENO] = p->fds[i];
6409 targets--;
6410
6411 } else if (named_iofds[STDERR_FILENO] < 0 &&
6412 c->std_error == EXEC_OUTPUT_NAMED_FD &&
6413 stdio_fdname[STDERR_FILENO] &&
6414 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
6415
6416 named_iofds[STDERR_FILENO] = p->fds[i];
6417 targets--;
6418 }
6419
6420 return targets == 0 ? 0 : -ENOENT;
6421 }
6422
6423 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
6424 _cleanup_strv_free_ char **v = NULL;
6425 int r;
6426
6427 assert(c);
6428 assert(ret);
6429
6430 STRV_FOREACH(i, c->environment_files) {
6431 _cleanup_globfree_ glob_t pglob = {};
6432 bool ignore = false;
6433 char *fn = *i;
6434
6435 if (fn[0] == '-') {
6436 ignore = true;
6437 fn++;
6438 }
6439
6440 if (!path_is_absolute(fn)) {
6441 if (ignore)
6442 continue;
6443 return -EINVAL;
6444 }
6445
6446 /* Filename supports globbing, take all matching files */
6447 r = safe_glob(fn, 0, &pglob);
6448 if (r < 0) {
6449 if (ignore)
6450 continue;
6451 return r;
6452 }
6453
6454 /* When we don't match anything, -ENOENT should be returned */
6455 assert(pglob.gl_pathc > 0);
6456
6457 for (size_t n = 0; n < pglob.gl_pathc; n++) {
6458 _cleanup_strv_free_ char **p = NULL;
6459
6460 r = load_env_file(NULL, pglob.gl_pathv[n], &p);
6461 if (r < 0) {
6462 if (ignore)
6463 continue;
6464 return r;
6465 }
6466
6467 /* Log invalid environment variables with filename */
6468 if (p) {
6469 InvalidEnvInfo info = {
6470 .unit = unit,
6471 .path = pglob.gl_pathv[n]
6472 };
6473
6474 p = strv_env_clean_with_callback(p, invalid_env, &info);
6475 }
6476
6477 if (!v)
6478 v = TAKE_PTR(p);
6479 else {
6480 char **m = strv_env_merge(v, p);
6481 if (!m)
6482 return -ENOMEM;
6483
6484 strv_free_and_replace(v, m);
6485 }
6486 }
6487 }
6488
6489 *ret = TAKE_PTR(v);
6490
6491 return 0;
6492 }
6493
6494 static bool tty_may_match_dev_console(const char *tty) {
6495 _cleanup_free_ char *resolved = NULL;
6496
6497 if (!tty)
6498 return true;
6499
6500 tty = skip_dev_prefix(tty);
6501
6502 /* trivial identity? */
6503 if (streq(tty, "console"))
6504 return true;
6505
6506 if (resolve_dev_console(&resolved) < 0)
6507 return true; /* if we could not resolve, assume it may */
6508
6509 /* "tty0" means the active VC, so it may be the same sometimes */
6510 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6511 }
6512
6513 static bool exec_context_may_touch_tty(const ExecContext *ec) {
6514 assert(ec);
6515
6516 return ec->tty_reset ||
6517 ec->tty_vhangup ||
6518 ec->tty_vt_disallocate ||
6519 is_terminal_input(ec->std_input) ||
6520 is_terminal_output(ec->std_output) ||
6521 is_terminal_output(ec->std_error);
6522 }
6523
6524 bool exec_context_may_touch_console(const ExecContext *ec) {
6525
6526 return exec_context_may_touch_tty(ec) &&
6527 tty_may_match_dev_console(exec_context_tty_path(ec));
6528 }
6529
6530 static void strv_fprintf(FILE *f, char **l) {
6531 assert(f);
6532
6533 STRV_FOREACH(g, l)
6534 fprintf(f, " %s", *g);
6535 }
6536
/* Writes one line of the form "<prefix><name>: <item> <item> ..." to f. Nothing is written if the
 * vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
6548
6549 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
6550 int r;
6551
6552 assert(c);
6553 assert(f);
6554
6555 prefix = strempty(prefix);
6556
6557 fprintf(f,
6558 "%sUMask: %04o\n"
6559 "%sWorkingDirectory: %s\n"
6560 "%sRootDirectory: %s\n"
6561 "%sRootEphemeral: %s\n"
6562 "%sNonBlocking: %s\n"
6563 "%sPrivateTmp: %s\n"
6564 "%sPrivateDevices: %s\n"
6565 "%sProtectKernelTunables: %s\n"
6566 "%sProtectKernelModules: %s\n"
6567 "%sProtectKernelLogs: %s\n"
6568 "%sProtectClock: %s\n"
6569 "%sProtectControlGroups: %s\n"
6570 "%sPrivateNetwork: %s\n"
6571 "%sPrivateUsers: %s\n"
6572 "%sProtectHome: %s\n"
6573 "%sProtectSystem: %s\n"
6574 "%sMountAPIVFS: %s\n"
6575 "%sIgnoreSIGPIPE: %s\n"
6576 "%sMemoryDenyWriteExecute: %s\n"
6577 "%sRestrictRealtime: %s\n"
6578 "%sRestrictSUIDSGID: %s\n"
6579 "%sKeyringMode: %s\n"
6580 "%sProtectHostname: %s\n"
6581 "%sProtectProc: %s\n"
6582 "%sProcSubset: %s\n",
6583 prefix, c->umask,
6584 prefix, empty_to_root(c->working_directory),
6585 prefix, empty_to_root(c->root_directory),
6586 prefix, yes_no(c->root_ephemeral),
6587 prefix, yes_no(c->non_blocking),
6588 prefix, yes_no(c->private_tmp),
6589 prefix, yes_no(c->private_devices),
6590 prefix, yes_no(c->protect_kernel_tunables),
6591 prefix, yes_no(c->protect_kernel_modules),
6592 prefix, yes_no(c->protect_kernel_logs),
6593 prefix, yes_no(c->protect_clock),
6594 prefix, yes_no(c->protect_control_groups),
6595 prefix, yes_no(c->private_network),
6596 prefix, yes_no(c->private_users),
6597 prefix, protect_home_to_string(c->protect_home),
6598 prefix, protect_system_to_string(c->protect_system),
6599 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
6600 prefix, yes_no(c->ignore_sigpipe),
6601 prefix, yes_no(c->memory_deny_write_execute),
6602 prefix, yes_no(c->restrict_realtime),
6603 prefix, yes_no(c->restrict_suid_sgid),
6604 prefix, exec_keyring_mode_to_string(c->keyring_mode),
6605 prefix, yes_no(c->protect_hostname),
6606 prefix, protect_proc_to_string(c->protect_proc),
6607 prefix, proc_subset_to_string(c->proc_subset));
6608
6609 if (c->root_image)
6610 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
6611
6612 if (c->root_image_options) {
6613 fprintf(f, "%sRootImageOptions:", prefix);
6614 LIST_FOREACH(mount_options, o, c->root_image_options)
6615 if (!isempty(o->options))
6616 fprintf(f, " %s:%s",
6617 partition_designator_to_string(o->partition_designator),
6618 o->options);
6619 fprintf(f, "\n");
6620 }
6621
6622 if (c->root_hash) {
6623 _cleanup_free_ char *encoded = NULL;
6624 encoded = hexmem(c->root_hash, c->root_hash_size);
6625 if (encoded)
6626 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
6627 }
6628
6629 if (c->root_hash_path)
6630 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
6631
6632 if (c->root_hash_sig) {
6633 _cleanup_free_ char *encoded = NULL;
6634 ssize_t len;
6635 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
6636 if (len)
6637 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
6638 }
6639
6640 if (c->root_hash_sig_path)
6641 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
6642
6643 if (c->root_verity)
6644 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
6645
6646 STRV_FOREACH(e, c->environment)
6647 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
6648
6649 STRV_FOREACH(e, c->environment_files)
6650 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
6651
6652 STRV_FOREACH(e, c->pass_environment)
6653 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
6654
6655 STRV_FOREACH(e, c->unset_environment)
6656 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
6657
6658 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
6659
6660 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
6661 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
6662
6663 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
6664 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
6665
6666 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
6667 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
6668 }
6669 }
6670
6671 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
6672
6673 if (c->nice_set)
6674 fprintf(f, "%sNice: %i\n", prefix, c->nice);
6675
6676 if (c->oom_score_adjust_set)
6677 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
6678
6679 if (c->coredump_filter_set)
6680 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
6681
6682 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
6683 if (c->rlimit[i]) {
6684 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
6685 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
6686 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
6687 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
6688 }
6689
6690 if (c->ioprio_set) {
6691 _cleanup_free_ char *class_str = NULL;
6692
6693 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
6694 if (r >= 0)
6695 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
6696
6697 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
6698 }
6699
6700 if (c->cpu_sched_set) {
6701 _cleanup_free_ char *policy_str = NULL;
6702
6703 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
6704 if (r >= 0)
6705 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
6706
6707 fprintf(f,
6708 "%sCPUSchedulingPriority: %i\n"
6709 "%sCPUSchedulingResetOnFork: %s\n",
6710 prefix, c->cpu_sched_priority,
6711 prefix, yes_no(c->cpu_sched_reset_on_fork));
6712 }
6713
6714 if (c->cpu_set.set) {
6715 _cleanup_free_ char *affinity = NULL;
6716
6717 affinity = cpu_set_to_range_string(&c->cpu_set);
6718 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
6719 }
6720
6721 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
6722 _cleanup_free_ char *nodes = NULL;
6723
6724 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
6725 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
6726 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
6727 }
6728
6729 if (c->timer_slack_nsec != NSEC_INFINITY)
6730 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
6731
6732 fprintf(f,
6733 "%sStandardInput: %s\n"
6734 "%sStandardOutput: %s\n"
6735 "%sStandardError: %s\n",
6736 prefix, exec_input_to_string(c->std_input),
6737 prefix, exec_output_to_string(c->std_output),
6738 prefix, exec_output_to_string(c->std_error));
6739
6740 if (c->std_input == EXEC_INPUT_NAMED_FD)
6741 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
6742 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
6743 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
6744 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
6745 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
6746
6747 if (c->std_input == EXEC_INPUT_FILE)
6748 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
6749 if (c->std_output == EXEC_OUTPUT_FILE)
6750 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6751 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
6752 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6753 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
6754 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6755 if (c->std_error == EXEC_OUTPUT_FILE)
6756 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6757 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
6758 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6759 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
6760 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6761
6762 if (c->tty_path)
6763 fprintf(f,
6764 "%sTTYPath: %s\n"
6765 "%sTTYReset: %s\n"
6766 "%sTTYVHangup: %s\n"
6767 "%sTTYVTDisallocate: %s\n"
6768 "%sTTYRows: %u\n"
6769 "%sTTYColumns: %u\n",
6770 prefix, c->tty_path,
6771 prefix, yes_no(c->tty_reset),
6772 prefix, yes_no(c->tty_vhangup),
6773 prefix, yes_no(c->tty_vt_disallocate),
6774 prefix, c->tty_rows,
6775 prefix, c->tty_cols);
6776
6777 if (IN_SET(c->std_output,
6778 EXEC_OUTPUT_KMSG,
6779 EXEC_OUTPUT_JOURNAL,
6780 EXEC_OUTPUT_KMSG_AND_CONSOLE,
6781 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
6782 IN_SET(c->std_error,
6783 EXEC_OUTPUT_KMSG,
6784 EXEC_OUTPUT_JOURNAL,
6785 EXEC_OUTPUT_KMSG_AND_CONSOLE,
6786 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
6787
6788 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
6789
6790 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
6791 if (r >= 0)
6792 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
6793
6794 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
6795 if (r >= 0)
6796 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
6797 }
6798
6799 if (c->log_level_max >= 0) {
6800 _cleanup_free_ char *t = NULL;
6801
6802 (void) log_level_to_string_alloc(c->log_level_max, &t);
6803
6804 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
6805 }
6806
6807 if (c->log_ratelimit_interval_usec > 0)
6808 fprintf(f,
6809 "%sLogRateLimitIntervalSec: %s\n",
6810 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
6811
6812 if (c->log_ratelimit_burst > 0)
6813 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
6814
6815 if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
6816 fprintf(f, "%sLogFilterPatterns:", prefix);
6817
6818 char *pattern;
6819 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
6820 fprintf(f, " %s", pattern);
6821 SET_FOREACH(pattern, c->log_filter_denied_patterns)
6822 fprintf(f, " ~%s", pattern);
6823 fputc('\n', f);
6824 }
6825
6826 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
6827 fprintf(f, "%sLogExtraFields: ", prefix);
6828 fwrite(c->log_extra_fields[j].iov_base,
6829 1, c->log_extra_fields[j].iov_len,
6830 f);
6831 fputc('\n', f);
6832 }
6833
6834 if (c->log_namespace)
6835 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
6836
6837 if (c->secure_bits) {
6838 _cleanup_free_ char *str = NULL;
6839
6840 r = secure_bits_to_string_alloc(c->secure_bits, &str);
6841 if (r >= 0)
6842 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
6843 }
6844
6845 if (c->capability_bounding_set != CAP_MASK_UNSET) {
6846 _cleanup_free_ char *str = NULL;
6847
6848 r = capability_set_to_string(c->capability_bounding_set, &str);
6849 if (r >= 0)
6850 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6851 }
6852
6853 if (c->capability_ambient_set != 0) {
6854 _cleanup_free_ char *str = NULL;
6855
6856 r = capability_set_to_string(c->capability_ambient_set, &str);
6857 if (r >= 0)
6858 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6859 }
6860
6861 if (c->user)
6862 fprintf(f, "%sUser: %s\n", prefix, c->user);
6863 if (c->group)
6864 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6865
6866 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6867
6868 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6869
6870 if (c->pam_name)
6871 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6872
6873 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6874 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6875 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6876 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6877 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6878 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6879
6880 for (size_t i = 0; i < c->n_bind_mounts; i++)
6881 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6882 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6883 c->bind_mounts[i].ignore_enoent ? "-": "",
6884 c->bind_mounts[i].source,
6885 c->bind_mounts[i].destination,
6886 c->bind_mounts[i].recursive ? "rbind" : "norbind");
6887
6888 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6889 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6890
6891 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6892 t->path,
6893 isempty(t->options) ? "" : ":",
6894 strempty(t->options));
6895 }
6896
6897 if (c->utmp_id)
6898 fprintf(f,
6899 "%sUtmpIdentifier: %s\n",
6900 prefix, c->utmp_id);
6901
6902 if (c->selinux_context)
6903 fprintf(f,
6904 "%sSELinuxContext: %s%s\n",
6905 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6906
6907 if (c->apparmor_profile)
6908 fprintf(f,
6909 "%sAppArmorProfile: %s%s\n",
6910 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6911
6912 if (c->smack_process_label)
6913 fprintf(f,
6914 "%sSmackProcessLabel: %s%s\n",
6915 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6916
6917 if (c->personality != PERSONALITY_INVALID)
6918 fprintf(f,
6919 "%sPersonality: %s\n",
6920 prefix, strna(personality_to_string(c->personality)));
6921
6922 fprintf(f,
6923 "%sLockPersonality: %s\n",
6924 prefix, yes_no(c->lock_personality));
6925
6926 if (c->syscall_filter) {
6927 fprintf(f,
6928 "%sSystemCallFilter: ",
6929 prefix);
6930
6931 if (!c->syscall_allow_list)
6932 fputc('~', f);
6933
6934 #if HAVE_SECCOMP
6935 void *id, *val;
6936 bool first = true;
6937 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6938 _cleanup_free_ char *name = NULL;
6939 const char *errno_name = NULL;
6940 int num = PTR_TO_INT(val);
6941
6942 if (first)
6943 first = false;
6944 else
6945 fputc(' ', f);
6946
6947 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6948 fputs(strna(name), f);
6949
6950 if (num >= 0) {
6951 errno_name = seccomp_errno_or_action_to_string(num);
6952 if (errno_name)
6953 fprintf(f, ":%s", errno_name);
6954 else
6955 fprintf(f, ":%d", num);
6956 }
6957 }
6958 #endif
6959
6960 fputc('\n', f);
6961 }
6962
6963 if (c->syscall_archs) {
6964 fprintf(f,
6965 "%sSystemCallArchitectures:",
6966 prefix);
6967
6968 #if HAVE_SECCOMP
6969 void *id;
6970 SET_FOREACH(id, c->syscall_archs)
6971 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6972 #endif
6973 fputc('\n', f);
6974 }
6975
6976 if (exec_context_restrict_namespaces_set(c)) {
6977 _cleanup_free_ char *s = NULL;
6978
6979 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6980 if (r >= 0)
6981 fprintf(f, "%sRestrictNamespaces: %s\n",
6982 prefix, strna(s));
6983 }
6984
6985 #if HAVE_LIBBPF
6986 if (exec_context_restrict_filesystems_set(c)) {
6987 char *fs;
6988 SET_FOREACH(fs, c->restrict_filesystems)
6989 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6990 }
6991 #endif
6992
6993 if (c->network_namespace_path)
6994 fprintf(f,
6995 "%sNetworkNamespacePath: %s\n",
6996 prefix, c->network_namespace_path);
6997
6998 if (c->syscall_errno > 0) {
6999 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
7000
7001 #if HAVE_SECCOMP
7002 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
7003 if (errno_name)
7004 fputs(errno_name, f);
7005 else
7006 fprintf(f, "%d", c->syscall_errno);
7007 #endif
7008 fputc('\n', f);
7009 }
7010
7011 for (size_t i = 0; i < c->n_mount_images; i++) {
7012 fprintf(f, "%sMountImages: %s%s:%s", prefix,
7013 c->mount_images[i].ignore_enoent ? "-": "",
7014 c->mount_images[i].source,
7015 c->mount_images[i].destination);
7016 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
7017 fprintf(f, ":%s:%s",
7018 partition_designator_to_string(o->partition_designator),
7019 strempty(o->options));
7020 fprintf(f, "\n");
7021 }
7022
7023 for (size_t i = 0; i < c->n_extension_images; i++) {
7024 fprintf(f, "%sExtensionImages: %s%s", prefix,
7025 c->extension_images[i].ignore_enoent ? "-": "",
7026 c->extension_images[i].source);
7027 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
7028 fprintf(f, ":%s:%s",
7029 partition_designator_to_string(o->partition_designator),
7030 strempty(o->options));
7031 fprintf(f, "\n");
7032 }
7033
7034 strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
7035 }
7036
7037 bool exec_context_maintains_privileges(const ExecContext *c) {
7038 assert(c);
7039
7040 /* Returns true if the process forked off would run under
7041 * an unchanged UID or as root. */
7042
7043 if (!c->user)
7044 return true;
7045
7046 if (streq(c->user, "root") || streq(c->user, "0"))
7047 return true;
7048
7049 return false;
7050 }
7051
7052 int exec_context_get_effective_ioprio(const ExecContext *c) {
7053 int p;
7054
7055 assert(c);
7056
7057 if (c->ioprio_set)
7058 return c->ioprio;
7059
7060 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
7061 if (p < 0)
7062 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
7063
7064 return ioprio_normalize(p);
7065 }
7066
7067 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
7068 assert(c);
7069
7070 /* Explicit setting wins */
7071 if (c->mount_apivfs_set)
7072 return c->mount_apivfs;
7073
7074 /* Default to "yes" if root directory or image are specified */
7075 if (exec_context_with_rootfs(c))
7076 return true;
7077
7078 return false;
7079 }
7080
7081 void exec_context_free_log_extra_fields(ExecContext *c) {
7082 assert(c);
7083
7084 for (size_t l = 0; l < c->n_log_extra_fields; l++)
7085 free(c->log_extra_fields[l].iov_base);
7086 c->log_extra_fields = mfree(c->log_extra_fields);
7087 c->n_log_extra_fields = 0;
7088 }
7089
7090 void exec_context_revert_tty(ExecContext *c) {
7091 _cleanup_close_ int fd = -EBADF;
7092 const char *path;
7093 struct stat st;
7094 int r;
7095
7096 assert(c);
7097
7098 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
7099 exec_context_tty_reset(c, NULL);
7100
7101 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
7102 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
7103 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
7104 if (!exec_context_may_touch_tty(c))
7105 return;
7106
7107 path = exec_context_tty_path(c);
7108 if (!path)
7109 return;
7110
7111 fd = open(path, O_PATH|O_CLOEXEC);
7112 if (fd < 0)
7113 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
7114 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
7115 path);
7116
7117 if (fstat(fd, &st) < 0)
7118 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
7119
7120 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
7121 * if things are a character device, since a proper check either means we'd have to open the TTY and
7122 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
7123 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
7124 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
7125 if (!S_ISCHR(st.st_mode))
7126 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
7127
7128 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
7129 if (r < 0)
7130 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
7131 }
7132
7133 int exec_context_get_clean_directories(
7134 ExecContext *c,
7135 char **prefix,
7136 ExecCleanMask mask,
7137 char ***ret) {
7138
7139 _cleanup_strv_free_ char **l = NULL;
7140 int r;
7141
7142 assert(c);
7143 assert(prefix);
7144 assert(ret);
7145
7146 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
7147 if (!FLAGS_SET(mask, 1U << t))
7148 continue;
7149
7150 if (!prefix[t])
7151 continue;
7152
7153 for (size_t i = 0; i < c->directories[t].n_items; i++) {
7154 char *j;
7155
7156 j = path_join(prefix[t], c->directories[t].items[i].path);
7157 if (!j)
7158 return -ENOMEM;
7159
7160 r = strv_consume(&l, j);
7161 if (r < 0)
7162 return r;
7163
7164 /* Also remove private directories unconditionally. */
7165 if (t != EXEC_DIRECTORY_CONFIGURATION) {
7166 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
7167 if (!j)
7168 return -ENOMEM;
7169
7170 r = strv_consume(&l, j);
7171 if (r < 0)
7172 return r;
7173 }
7174
7175 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
7176 j = path_join(prefix[t], *symlink);
7177 if (!j)
7178 return -ENOMEM;
7179
7180 r = strv_consume(&l, j);
7181 if (r < 0)
7182 return r;
7183 }
7184 }
7185 }
7186
7187 *ret = TAKE_PTR(l);
7188 return 0;
7189 }
7190
7191 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
7192 ExecCleanMask mask = 0;
7193
7194 assert(c);
7195 assert(ret);
7196
7197 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
7198 if (c->directories[t].n_items > 0)
7199 mask |= 1U << t;
7200
7201 *ret = mask;
7202 return 0;
7203 }
7204
7205 bool exec_context_has_encrypted_credentials(ExecContext *c) {
7206 ExecLoadCredential *load_cred;
7207 ExecSetCredential *set_cred;
7208
7209 assert(c);
7210
7211 HASHMAP_FOREACH(load_cred, c->load_credentials)
7212 if (load_cred->encrypted)
7213 return true;
7214
7215 HASHMAP_FOREACH(set_cred, c->set_credentials)
7216 if (set_cred->encrypted)
7217 return true;
7218
7219 return false;
7220 }
7221
7222 int exec_context_add_default_dependencies(Unit *u, const ExecContext *c) {
7223 assert(u);
7224 assert(u->default_dependencies);
7225
7226 if (c && exec_context_needs_term(c))
7227 return unit_add_dependency_by_name(u, UNIT_AFTER, SPECIAL_VCONSOLE_SETUP_SERVICE,
7228 /* add_reference= */ true, UNIT_DEPENDENCY_DEFAULT);
7229 return 0;
7230 }
7231
7232 void exec_status_start(ExecStatus *s, pid_t pid) {
7233 assert(s);
7234
7235 *s = (ExecStatus) {
7236 .pid = pid,
7237 };
7238
7239 dual_timestamp_get(&s->start_timestamp);
7240 }
7241
7242 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
7243 assert(s);
7244
7245 if (s->pid != pid)
7246 *s = (ExecStatus) {
7247 .pid = pid,
7248 };
7249
7250 dual_timestamp_get(&s->exit_timestamp);
7251
7252 s->code = code;
7253 s->status = status;
7254
7255 if (context && context->utmp_id)
7256 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
7257 }
7258
7259 void exec_status_reset(ExecStatus *s) {
7260 assert(s);
7261
7262 *s = (ExecStatus) {};
7263 }
7264
7265 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
7266 assert(s);
7267 assert(f);
7268
7269 if (s->pid <= 0)
7270 return;
7271
7272 prefix = strempty(prefix);
7273
7274 fprintf(f,
7275 "%sPID: "PID_FMT"\n",
7276 prefix, s->pid);
7277
7278 if (dual_timestamp_is_set(&s->start_timestamp))
7279 fprintf(f,
7280 "%sStart Timestamp: %s\n",
7281 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
7282
7283 if (dual_timestamp_is_set(&s->exit_timestamp))
7284 fprintf(f,
7285 "%sExit Timestamp: %s\n"
7286 "%sExit Code: %s\n"
7287 "%sExit Status: %i\n",
7288 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
7289 prefix, sigchld_code_to_string(s->code),
7290 prefix, s->status);
7291 }
7292
7293 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
7294 _cleanup_free_ char *cmd = NULL;
7295 const char *prefix2;
7296
7297 assert(c);
7298 assert(f);
7299
7300 prefix = strempty(prefix);
7301 prefix2 = strjoina(prefix, "\t");
7302
7303 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
7304
7305 fprintf(f,
7306 "%sCommand Line: %s\n",
7307 prefix, strnull(cmd));
7308
7309 exec_status_dump(&c->exec_status, f, prefix2);
7310 }
7311
7312 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
7313 assert(f);
7314
7315 prefix = strempty(prefix);
7316
7317 LIST_FOREACH(command, i, c)
7318 exec_command_dump(i, f, prefix);
7319 }
7320
7321 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
7322 ExecCommand *end;
7323
7324 assert(l);
7325 assert(e);
7326
7327 if (*l) {
7328 /* It's kind of important, that we keep the order here */
7329 end = LIST_FIND_TAIL(command, *l);
7330 LIST_INSERT_AFTER(command, *l, end, e);
7331 } else
7332 *l = e;
7333 }
7334
7335 int exec_command_set(ExecCommand *c, const char *path, ...) {
7336 va_list ap;
7337 char **l, *p;
7338
7339 assert(c);
7340 assert(path);
7341
7342 va_start(ap, path);
7343 l = strv_new_ap(path, ap);
7344 va_end(ap);
7345
7346 if (!l)
7347 return -ENOMEM;
7348
7349 p = strdup(path);
7350 if (!p) {
7351 strv_free(l);
7352 return -ENOMEM;
7353 }
7354
7355 free_and_replace(c->path, p);
7356
7357 return strv_free_and_replace(c->argv, l);
7358 }
7359
7360 int exec_command_append(ExecCommand *c, const char *path, ...) {
7361 _cleanup_strv_free_ char **l = NULL;
7362 va_list ap;
7363 int r;
7364
7365 assert(c);
7366 assert(path);
7367
7368 va_start(ap, path);
7369 l = strv_new_ap(path, ap);
7370 va_end(ap);
7371
7372 if (!l)
7373 return -ENOMEM;
7374
7375 r = strv_extend_strv(&c->argv, l, false);
7376 if (r < 0)
7377 return r;
7378
7379 return 0;
7380 }
7381
7382 static char *destroy_tree(char *path) {
7383 if (!path)
7384 return NULL;
7385
7386 if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
7387 log_debug("Spawning process to nuke '%s'", path);
7388
7389 (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
7390 }
7391
7392 return mfree(path);
7393 }
7394
7395 static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
7396 if (!rt)
7397 return NULL;
7398
7399 if (rt->manager)
7400 (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
7401
7402 rt->id = mfree(rt->id);
7403 rt->tmp_dir = mfree(rt->tmp_dir);
7404 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
7405 safe_close_pair(rt->netns_storage_socket);
7406 safe_close_pair(rt->ipcns_storage_socket);
7407 return mfree(rt);
7408 }
7409
7410 DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
7411 DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
7412
7413 ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
7414 if (!rt)
7415 return NULL;
7416
7417 assert(rt->n_ref > 0);
7418 rt->n_ref--;
7419
7420 if (rt->n_ref > 0)
7421 return NULL;
7422
7423 rt->tmp_dir = destroy_tree(rt->tmp_dir);
7424 rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
7425
7426 return exec_shared_runtime_free(rt);
7427 }
7428
7429 static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
7430 _cleanup_free_ char *id_copy = NULL;
7431 ExecSharedRuntime *n;
7432
7433 assert(ret);
7434
7435 id_copy = strdup(id);
7436 if (!id_copy)
7437 return -ENOMEM;
7438
7439 n = new(ExecSharedRuntime, 1);
7440 if (!n)
7441 return -ENOMEM;
7442
7443 *n = (ExecSharedRuntime) {
7444 .id = TAKE_PTR(id_copy),
7445 .netns_storage_socket = PIPE_EBADF,
7446 .ipcns_storage_socket = PIPE_EBADF,
7447 };
7448
7449 *ret = n;
7450 return 0;
7451 }
7452
7453 static int exec_shared_runtime_add(
7454 Manager *m,
7455 const char *id,
7456 char **tmp_dir,
7457 char **var_tmp_dir,
7458 int netns_storage_socket[2],
7459 int ipcns_storage_socket[2],
7460 ExecSharedRuntime **ret) {
7461
7462 _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
7463 int r;
7464
7465 assert(m);
7466 assert(id);
7467
7468 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
7469
7470 r = exec_shared_runtime_allocate(&rt, id);
7471 if (r < 0)
7472 return r;
7473
7474 r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
7475 if (r < 0)
7476 return r;
7477
7478 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
7479 rt->tmp_dir = TAKE_PTR(*tmp_dir);
7480 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
7481
7482 if (netns_storage_socket) {
7483 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
7484 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
7485 }
7486
7487 if (ipcns_storage_socket) {
7488 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
7489 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
7490 }
7491
7492 rt->manager = m;
7493
7494 if (ret)
7495 *ret = rt;
7496 /* do not remove created ExecSharedRuntime object when the operation succeeds. */
7497 TAKE_PTR(rt);
7498 return 0;
7499 }
7500
7501 static int exec_shared_runtime_make(
7502 Manager *m,
7503 const ExecContext *c,
7504 const char *id,
7505 ExecSharedRuntime **ret) {
7506
7507 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
7508 _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
7509 int r;
7510
7511 assert(m);
7512 assert(c);
7513 assert(id);
7514
7515 /* It is not necessary to create ExecSharedRuntime object. */
7516 if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
7517 *ret = NULL;
7518 return 0;
7519 }
7520
7521 if (c->private_tmp &&
7522 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
7523 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
7524 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
7525 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
7526 if (r < 0)
7527 return r;
7528 }
7529
7530 if (exec_needs_network_namespace(c)) {
7531 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
7532 return -errno;
7533 }
7534
7535 if (exec_needs_ipc_namespace(c)) {
7536 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
7537 return -errno;
7538 }
7539
7540 r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
7541 if (r < 0)
7542 return r;
7543
7544 return 1;
7545 }
7546
7547 int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
7548 ExecSharedRuntime *rt;
7549 int r;
7550
7551 assert(m);
7552 assert(id);
7553 assert(ret);
7554
7555 rt = hashmap_get(m->exec_shared_runtime_by_id, id);
7556 if (rt)
7557 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
7558 goto ref;
7559
7560 if (!create) {
7561 *ret = NULL;
7562 return 0;
7563 }
7564
7565 /* If not found, then create a new object. */
7566 r = exec_shared_runtime_make(m, c, id, &rt);
7567 if (r < 0)
7568 return r;
7569 if (r == 0) {
7570 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
7571 *ret = NULL;
7572 return 0;
7573 }
7574
7575 ref:
7576 /* increment reference counter. */
7577 rt->n_ref++;
7578 *ret = rt;
7579 return 1;
7580 }
7581
7582 int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
7583 ExecSharedRuntime *rt;
7584
7585 assert(m);
7586 assert(f);
7587 assert(fds);
7588
7589 HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
7590 fprintf(f, "exec-runtime=%s", rt->id);
7591
7592 if (rt->tmp_dir)
7593 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
7594
7595 if (rt->var_tmp_dir)
7596 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
7597
7598 if (rt->netns_storage_socket[0] >= 0) {
7599 int copy;
7600
7601 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
7602 if (copy < 0)
7603 return copy;
7604
7605 fprintf(f, " netns-socket-0=%i", copy);
7606 }
7607
7608 if (rt->netns_storage_socket[1] >= 0) {
7609 int copy;
7610
7611 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
7612 if (copy < 0)
7613 return copy;
7614
7615 fprintf(f, " netns-socket-1=%i", copy);
7616 }
7617
7618 if (rt->ipcns_storage_socket[0] >= 0) {
7619 int copy;
7620
7621 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
7622 if (copy < 0)
7623 return copy;
7624
7625 fprintf(f, " ipcns-socket-0=%i", copy);
7626 }
7627
7628 if (rt->ipcns_storage_socket[1] >= 0) {
7629 int copy;
7630
7631 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
7632 if (copy < 0)
7633 return copy;
7634
7635 fprintf(f, " ipcns-socket-1=%i", copy);
7636 }
7637
7638 fputc('\n', f);
7639 }
7640
7641 return 0;
7642 }
7643
7644 int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
7645 _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
7646 ExecSharedRuntime *rt;
7647 int r;
7648
7649 /* This is for the migration from old (v237 or earlier) deserialization text.
7650 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
7651 * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
7652 * so or not from the serialized text, then we always creates a new object owned by this. */
7653
7654 assert(u);
7655 assert(key);
7656 assert(value);
7657
7658 /* Manager manages ExecSharedRuntime objects by the unit id.
7659 * So, we omit the serialized text when the unit does not have id (yet?)... */
7660 if (isempty(u->id)) {
7661 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
7662 return 0;
7663 }
7664
7665 if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
7666 return log_oom();
7667
7668 rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
7669 if (!rt) {
7670 if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
7671 return log_oom();
7672
7673 rt = rt_create;
7674 }
7675
7676 if (streq(key, "tmp-dir")) {
7677 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
7678 return -ENOMEM;
7679
7680 } else if (streq(key, "var-tmp-dir")) {
7681 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
7682 return -ENOMEM;
7683
7684 } else if (streq(key, "netns-socket-0")) {
7685 int fd;
7686
7687 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
7688 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
7689 return 0;
7690 }
7691
7692 safe_close(rt->netns_storage_socket[0]);
7693 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
7694
7695 } else if (streq(key, "netns-socket-1")) {
7696 int fd;
7697
7698 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
7699 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
7700 return 0;
7701 }
7702
7703 safe_close(rt->netns_storage_socket[1]);
7704 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
7705
7706 } else
7707 return 0;
7708
7709 /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
7710 if (rt_create) {
7711 r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
7712 if (r < 0) {
7713 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
7714 return 0;
7715 }
7716
7717 rt_create->manager = u->manager;
7718
7719 /* Avoid cleanup */
7720 TAKE_PTR(rt_create);
7721 }
7722
7723 return 1;
7724 }
7725
7726 int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
7727 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
7728 char *id = NULL;
7729 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
7730 const char *p, *v = ASSERT_PTR(value);
7731 size_t n;
7732
7733 assert(m);
7734 assert(fds);
7735
7736 n = strcspn(v, " ");
7737 id = strndupa_safe(v, n);
7738 if (v[n] != ' ')
7739 goto finalize;
7740 p = v + n + 1;
7741
7742 v = startswith(p, "tmp-dir=");
7743 if (v) {
7744 n = strcspn(v, " ");
7745 tmp_dir = strndup(v, n);
7746 if (!tmp_dir)
7747 return log_oom();
7748 if (v[n] != ' ')
7749 goto finalize;
7750 p = v + n + 1;
7751 }
7752
7753 v = startswith(p, "var-tmp-dir=");
7754 if (v) {
7755 n = strcspn(v, " ");
7756 var_tmp_dir = strndup(v, n);
7757 if (!var_tmp_dir)
7758 return log_oom();
7759 if (v[n] != ' ')
7760 goto finalize;
7761 p = v + n + 1;
7762 }
7763
7764 v = startswith(p, "netns-socket-0=");
7765 if (v) {
7766 char *buf;
7767
7768 n = strcspn(v, " ");
7769 buf = strndupa_safe(v, n);
7770
7771 netns_fdpair[0] = parse_fd(buf);
7772 if (netns_fdpair[0] < 0)
7773 return log_debug_errno(netns_fdpair[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
7774 if (!fdset_contains(fds, netns_fdpair[0]))
7775 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7776 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
7777 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
7778 if (v[n] != ' ')
7779 goto finalize;
7780 p = v + n + 1;
7781 }
7782
7783 v = startswith(p, "netns-socket-1=");
7784 if (v) {
7785 char *buf;
7786
7787 n = strcspn(v, " ");
7788 buf = strndupa_safe(v, n);
7789
7790 netns_fdpair[1] = parse_fd(buf);
7791 if (netns_fdpair[1] < 0)
7792 return log_debug_errno(netns_fdpair[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
7793 if (!fdset_contains(fds, netns_fdpair[1]))
7794 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7795 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
7796 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
7797 if (v[n] != ' ')
7798 goto finalize;
7799 p = v + n + 1;
7800 }
7801
7802 v = startswith(p, "ipcns-socket-0=");
7803 if (v) {
7804 char *buf;
7805
7806 n = strcspn(v, " ");
7807 buf = strndupa_safe(v, n);
7808
7809 ipcns_fdpair[0] = parse_fd(buf);
7810 if (ipcns_fdpair[0] < 0)
7811 return log_debug_errno(ipcns_fdpair[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
7812 if (!fdset_contains(fds, ipcns_fdpair[0]))
7813 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7814 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
7815 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
7816 if (v[n] != ' ')
7817 goto finalize;
7818 p = v + n + 1;
7819 }
7820
7821 v = startswith(p, "ipcns-socket-1=");
7822 if (v) {
7823 char *buf;
7824
7825 n = strcspn(v, " ");
7826 buf = strndupa_safe(v, n);
7827
7828 ipcns_fdpair[1] = parse_fd(buf);
7829 if (ipcns_fdpair[1] < 0)
7830 return log_debug_errno(ipcns_fdpair[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
7831 if (!fdset_contains(fds, ipcns_fdpair[1]))
7832 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7833 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
7834 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
7835 }
7836
7837 finalize:
7838 r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7839 if (r < 0)
7840 return log_debug_errno(r, "Failed to add exec-runtime: %m");
7841 return 0;
7842 }
7843
7844 void exec_shared_runtime_vacuum(Manager *m) {
7845 ExecSharedRuntime *rt;
7846
7847 assert(m);
7848
7849 /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
7850
7851 HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
7852 if (rt->n_ref > 0)
7853 continue;
7854
7855 (void) exec_shared_runtime_free(rt);
7856 }
7857 }
7858
7859 int exec_runtime_make(
7860 const Unit *unit,
7861 const ExecContext *context,
7862 ExecSharedRuntime *shared,
7863 DynamicCreds *creds,
7864 ExecRuntime **ret) {
7865 _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
7866 _cleanup_free_ char *ephemeral = NULL;
7867 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
7868 int r;
7869
7870 assert(unit);
7871 assert(context);
7872 assert(ret);
7873
7874 if (!shared && !creds && !exec_needs_ephemeral(context)) {
7875 *ret = NULL;
7876 return 0;
7877 }
7878
7879 if (exec_needs_ephemeral(context)) {
7880 r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
7881 if (r < 0)
7882 return r;
7883
7884 r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
7885 if (r < 0)
7886 return r;
7887
7888 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
7889 return -errno;
7890 }
7891
7892 rt = new(ExecRuntime, 1);
7893 if (!rt)
7894 return -ENOMEM;
7895
7896 *rt = (ExecRuntime) {
7897 .shared = shared,
7898 .dynamic_creds = creds,
7899 .ephemeral_copy = TAKE_PTR(ephemeral),
7900 .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
7901 .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
7902 };
7903
7904 *ret = TAKE_PTR(rt);
7905 return 1;
7906 }
7907
7908 ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
7909 if (!rt)
7910 return NULL;
7911
7912 exec_shared_runtime_unref(rt->shared);
7913 dynamic_creds_unref(rt->dynamic_creds);
7914
7915 rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
7916
7917 safe_close_pair(rt->ephemeral_storage_socket);
7918 return mfree(rt);
7919 }
7920
7921 ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
7922 if (!rt)
7923 return NULL;
7924
7925 rt->shared = exec_shared_runtime_destroy(rt->shared);
7926 rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
7927 return exec_runtime_free(rt);
7928 }
7929
7930 void exec_params_clear(ExecParameters *p) {
7931 if (!p)
7932 return;
7933
7934 p->environment = strv_free(p->environment);
7935 p->fd_names = strv_free(p->fd_names);
7936 p->fds = mfree(p->fds);
7937 p->exec_fd = safe_close(p->exec_fd);
7938 }
7939
7940 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7941 if (!sc)
7942 return NULL;
7943
7944 free(sc->id);
7945 free(sc->data);
7946 return mfree(sc);
7947 }
7948
7949 ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7950 if (!lc)
7951 return NULL;
7952
7953 free(lc->id);
7954 free(lc->path);
7955 return mfree(lc);
7956 }
7957
7958 void exec_directory_done(ExecDirectory *d) {
7959 if (!d)
7960 return;
7961
7962 for (size_t i = 0; i < d->n_items; i++) {
7963 free(d->items[i].path);
7964 strv_free(d->items[i].symlinks);
7965 }
7966
7967 d->items = mfree(d->items);
7968 d->n_items = 0;
7969 d->mode = 0755;
7970 }
7971
7972 static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7973 assert(d);
7974 assert(path);
7975
7976 for (size_t i = 0; i < d->n_items; i++)
7977 if (path_equal(d->items[i].path, path))
7978 return &d->items[i];
7979
7980 return NULL;
7981 }
7982
7983 int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
7984 _cleanup_strv_free_ char **s = NULL;
7985 _cleanup_free_ char *p = NULL;
7986 ExecDirectoryItem *existing;
7987 int r;
7988
7989 assert(d);
7990 assert(path);
7991
7992 existing = exec_directory_find(d, path);
7993 if (existing) {
7994 r = strv_extend(&existing->symlinks, symlink);
7995 if (r < 0)
7996 return r;
7997
7998 return 0; /* existing item is updated */
7999 }
8000
8001 p = strdup(path);
8002 if (!p)
8003 return -ENOMEM;
8004
8005 if (symlink) {
8006 s = strv_new(symlink);
8007 if (!s)
8008 return -ENOMEM;
8009 }
8010
8011 if (!GREEDY_REALLOC(d->items, d->n_items + 1))
8012 return -ENOMEM;
8013
8014 d->items[d->n_items++] = (ExecDirectoryItem) {
8015 .path = TAKE_PTR(p),
8016 .symlinks = TAKE_PTR(s),
8017 };
8018
8019 return 1; /* new item is added */
8020 }
8021
8022 static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
8023 assert(a);
8024 assert(b);
8025
8026 return path_compare(a->path, b->path);
8027 }
8028
8029 void exec_directory_sort(ExecDirectory *d) {
8030 assert(d);
8031
8032 /* Sort the exec directories to make always parent directories processed at first in
8033 * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
8034 * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
8035 * list. See also comments in setup_exec_directory() and issue #24783. */
8036
8037 if (d->n_items <= 1)
8038 return;
8039
8040 typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
8041
8042 for (size_t i = 1; i < d->n_items; i++)
8043 for (size_t j = 0; j < i; j++)
8044 if (path_startswith(d->items[i].path, d->items[j].path)) {
8045 d->items[i].only_create = true;
8046 break;
8047 }
8048 }
8049
8050 ExecCleanMask exec_clean_mask_from_string(const char *s) {
8051 ExecDirectoryType t;
8052
8053 assert(s);
8054
8055 if (streq(s, "all"))
8056 return EXEC_CLEAN_ALL;
8057 if (streq(s, "fdstore"))
8058 return EXEC_CLEAN_FDSTORE;
8059
8060 t = exec_resource_type_from_string(s);
8061 if (t < 0)
8062 return (ExecCleanMask) t;
8063
8064 return 1U << t;
8065 }
8066
8067 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
8068 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
8069
8070 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
8071 [EXEC_INPUT_NULL] = "null",
8072 [EXEC_INPUT_TTY] = "tty",
8073 [EXEC_INPUT_TTY_FORCE] = "tty-force",
8074 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
8075 [EXEC_INPUT_SOCKET] = "socket",
8076 [EXEC_INPUT_NAMED_FD] = "fd",
8077 [EXEC_INPUT_DATA] = "data",
8078 [EXEC_INPUT_FILE] = "file",
8079 };
8080
8081 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
8082
8083 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
8084 [EXEC_OUTPUT_INHERIT] = "inherit",
8085 [EXEC_OUTPUT_NULL] = "null",
8086 [EXEC_OUTPUT_TTY] = "tty",
8087 [EXEC_OUTPUT_KMSG] = "kmsg",
8088 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
8089 [EXEC_OUTPUT_JOURNAL] = "journal",
8090 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
8091 [EXEC_OUTPUT_SOCKET] = "socket",
8092 [EXEC_OUTPUT_NAMED_FD] = "fd",
8093 [EXEC_OUTPUT_FILE] = "file",
8094 [EXEC_OUTPUT_FILE_APPEND] = "append",
8095 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
8096 };
8097
8098 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
8099
8100 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
8101 [EXEC_UTMP_INIT] = "init",
8102 [EXEC_UTMP_LOGIN] = "login",
8103 [EXEC_UTMP_USER] = "user",
8104 };
8105
8106 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
8107
8108 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
8109 [EXEC_PRESERVE_NO] = "no",
8110 [EXEC_PRESERVE_YES] = "yes",
8111 [EXEC_PRESERVE_RESTART] = "restart",
8112 };
8113
8114 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
8115
8116 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
8117 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8118 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
8119 [EXEC_DIRECTORY_STATE] = "StateDirectory",
8120 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
8121 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
8122 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
8123 };
8124
8125 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
8126
8127 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
8128 static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8129 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
8130 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
8131 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
8132 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
8133 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
8134 };
8135
8136 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
8137
8138 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
8139 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
8140 * directories, specifically .timer units with their timestamp touch file. */
8141 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8142 [EXEC_DIRECTORY_RUNTIME] = "runtime",
8143 [EXEC_DIRECTORY_STATE] = "state",
8144 [EXEC_DIRECTORY_CACHE] = "cache",
8145 [EXEC_DIRECTORY_LOGS] = "logs",
8146 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
8147 };
8148
8149 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
8150
8151 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
8152 * the service payload in. */
8153 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8154 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
8155 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
8156 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
8157 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
8158 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
8159 };
8160
8161 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
8162
8163 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
8164 [EXEC_KEYRING_INHERIT] = "inherit",
8165 [EXEC_KEYRING_PRIVATE] = "private",
8166 [EXEC_KEYRING_SHARED] = "shared",
8167 };
8168
8169 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);