]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
Merge pull request #28269 from yuwata/udev-builtin-net_id-cleanups-part1
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <poll.h>
6 #include <sys/eventfd.h>
7 #include <sys/ioctl.h>
8 #include <sys/mman.h>
9 #include <sys/mount.h>
10 #include <sys/personality.h>
11 #include <sys/prctl.h>
12 #include <sys/shm.h>
13 #include <sys/types.h>
14 #include <sys/un.h>
15 #include <unistd.h>
16 #include <utmpx.h>
17
18 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
19
20 #if HAVE_PAM
21 #include <security/pam_appl.h>
22 #endif
23
24 #if HAVE_SELINUX
25 #include <selinux/selinux.h>
26 #endif
27
28 #if HAVE_SECCOMP
29 #include <seccomp.h>
30 #endif
31
32 #if HAVE_APPARMOR
33 #include <sys/apparmor.h>
34 #endif
35
36 #include "sd-messages.h"
37
38 #include "acl-util.h"
39 #include "af-list.h"
40 #include "alloc-util.h"
41 #if HAVE_APPARMOR
42 #include "apparmor-util.h"
43 #endif
44 #include "argv-util.h"
45 #include "async.h"
46 #include "barrier.h"
47 #include "bpf-lsm.h"
48 #include "btrfs-util.h"
49 #include "cap-list.h"
50 #include "capability-util.h"
51 #include "chattr-util.h"
52 #include "cgroup-setup.h"
53 #include "chase.h"
54 #include "chown-recursive.h"
55 #include "constants.h"
56 #include "cpu-set-util.h"
57 #include "creds-util.h"
58 #include "data-fd-util.h"
59 #include "env-file.h"
60 #include "env-util.h"
61 #include "errno-list.h"
62 #include "escape.h"
63 #include "execute.h"
64 #include "exit-status.h"
65 #include "fd-util.h"
66 #include "fileio.h"
67 #include "format-util.h"
68 #include "glob-util.h"
69 #include "hexdecoct.h"
70 #include "io-util.h"
71 #include "ioprio-util.h"
72 #include "label-util.h"
73 #include "lock-util.h"
74 #include "log.h"
75 #include "macro.h"
76 #include "manager.h"
77 #include "manager-dump.h"
78 #include "memory-util.h"
79 #include "missing_fs.h"
80 #include "missing_ioprio.h"
81 #include "missing_prctl.h"
82 #include "mkdir-label.h"
83 #include "mount-util.h"
84 #include "mountpoint-util.h"
85 #include "namespace.h"
86 #include "parse-util.h"
87 #include "path-util.h"
88 #include "proc-cmdline.h"
89 #include "process-util.h"
90 #include "psi-util.h"
91 #include "random-util.h"
92 #include "recurse-dir.h"
93 #include "rlimit-util.h"
94 #include "rm-rf.h"
95 #if HAVE_SECCOMP
96 #include "seccomp-util.h"
97 #endif
98 #include "securebits-util.h"
99 #include "selinux-util.h"
100 #include "signal-util.h"
101 #include "smack-util.h"
102 #include "socket-util.h"
103 #include "sort-util.h"
104 #include "special.h"
105 #include "stat-util.h"
106 #include "string-table.h"
107 #include "string-util.h"
108 #include "strv.h"
109 #include "syslog-util.h"
110 #include "terminal-util.h"
111 #include "tmpfile-util.h"
112 #include "umask-util.h"
113 #include "unit-serialize.h"
114 #include "user-util.h"
115 #include "utmp-wtmp.h"
116
/* Two timeouts expressed in microseconds (5s and 1s). They are not referenced in the part of the
 * file visible here; NOTE(review): presumably used when waiting on the idle pipe — confirm. */
117 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
118 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
119
/* Value (8 MiB) passed to fd_inc_sndbuf() for the journal stream socket in connect_logger_as(). */
120 #define SNDBUF_SIZE (8*1024*1024)
121
/* Renumbers the descriptors in fds[] so that entry i ends up as fd number i+3, i.e. the array
 * occupies the contiguous range that starts right after stdin/stdout/stderr.
 *
 * The array is modified in place: each moved entry is replaced by its duplicate and the old
 * descriptor is closed. Returns 0 on success, or -errno if fcntl(F_DUPFD) fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0, retry_at;

        if (n_fds <= 0)
                return 0;

        /* Modifies the fds array! */

        assert(fds);

        do {
                retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int target = idx + 3, dup_fd;

                        /* Nothing to do if the descriptor already sits in its slot */
                        if (fds[idx] == target)
                                continue;

                        /* F_DUPFD hands out the lowest free fd >= target, which is not necessarily
                         * target itself if that number is still occupied by a later entry. */
                        dup_fd = fcntl(fds[idx], F_DUPFD, target);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* Remember the first slot we could not fill, so that the next pass starts
                         * there once the blocking descriptor has been moved away. */
                        if (retry_at < 0 && dup_fd != target)
                                retry_at = idx;
                }

                first = retry_at;
        } while (retry_at >= 0);

        return 0;
}
161
/* Adjusts the descriptor flags of the n_fds entries in fds[] before they are handed to a child:
 *
 *  - the first n_socket_fds entries get O_NONBLOCK set or cleared according to 'nonblock'
 *    (O_NONBLOCK only applies to socket activation),
 *  - FD_CLOEXEC is cleared on every entry, since the fds are meant to be inherited.
 *
 * Returns 0 on success or the negative error of the first helper call that failed. */
static int flags_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        if (n_fds <= 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < n_fds; idx++) {
                int r;

                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* Unconditionally drop FD_CLOEXEC, after all we want to pass these fds on */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
197
198 static const char *exec_context_tty_path(const ExecContext *context) {
199 assert(context);
200
201 if (context->stdio_as_fds)
202 return NULL;
203
204 if (context->tty_path)
205 return context->tty_path;
206
207 return "/dev/console";
208 }
209
210 static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
211 unsigned rows, cols;
212 const char *tty;
213
214 assert(context);
215 assert(ret_rows);
216 assert(ret_cols);
217
218 rows = context->tty_rows;
219 cols = context->tty_cols;
220
221 tty = exec_context_tty_path(context);
222 if (tty)
223 (void) proc_cmdline_tty_size(tty, rows == UINT_MAX ? &rows : NULL, cols == UINT_MAX ? &cols : NULL);
224
225 *ret_rows = rows;
226 *ret_cols = cols;
227
228 return 0;
229 }
230
231 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
232 _cleanup_close_ int fd = -EBADF;
233 const char *path = exec_context_tty_path(ASSERT_PTR(context));
234
235 /* Take a lock around the device for the duration of the setup that we do here.
236 * systemd-vconsole-setup.service also takes the lock to avoid being interrupted.
237 * We open a new fd that will be closed automatically, and operate on it for convenience.
238 */
239
240 if (p && p->stdin_fd >= 0) {
241 fd = xopenat_lock(p->stdin_fd, NULL,
242 O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, 0, 0, LOCK_BSD, LOCK_EX);
243 if (fd < 0)
244 return;
245 } else if (path) {
246 fd = open_terminal(path, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
247 if (fd < 0)
248 return;
249
250 if (lock_generic(fd, LOCK_BSD, LOCK_EX) < 0)
251 return;
252 } else
253 return; /* nothing to do */
254
255 if (context->tty_vhangup)
256 (void) terminal_vhangup_fd(fd);
257
258 if (context->tty_reset)
259 (void) reset_terminal_fd(fd, true);
260
261 if (p && p->stdin_fd >= 0) {
262 unsigned rows = context->tty_rows, cols = context->tty_cols;
263
264 (void) exec_context_tty_size(context, &rows, &cols);
265 (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
266 }
267
268 if (context->tty_vt_disallocate && path)
269 (void) vt_disallocate(path);
270 }
271
272 static bool is_terminal_input(ExecInput i) {
273 return IN_SET(i,
274 EXEC_INPUT_TTY,
275 EXEC_INPUT_TTY_FORCE,
276 EXEC_INPUT_TTY_FAIL);
277 }
278
279 static bool is_terminal_output(ExecOutput o) {
280 return IN_SET(o,
281 EXEC_OUTPUT_TTY,
282 EXEC_OUTPUT_KMSG_AND_CONSOLE,
283 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
284 }
285
286 static bool is_kmsg_output(ExecOutput o) {
287 return IN_SET(o,
288 EXEC_OUTPUT_KMSG,
289 EXEC_OUTPUT_KMSG_AND_CONSOLE);
290 }
291
292 static bool exec_context_needs_term(const ExecContext *c) {
293 assert(c);
294
295 /* Return true if the execution context suggests we should set $TERM to something useful. */
296
297 if (is_terminal_input(c->std_input))
298 return true;
299
300 if (is_terminal_output(c->std_output))
301 return true;
302
303 if (is_terminal_output(c->std_error))
304 return true;
305
306 return !!c->tty_path;
307 }
308
/* Opens /dev/null with the given access flags (O_NOCTTY is always added) and hands the resulting
 * descriptor to move_fd() so that it ends up as fd number 'nfd'. Returns -errno if the open
 * fails, otherwise whatever move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
320
321 static int connect_journal_socket(
322 int fd,
323 const char *log_namespace,
324 uid_t uid,
325 gid_t gid) {
326
327 uid_t olduid = UID_INVALID;
328 gid_t oldgid = GID_INVALID;
329 const char *j;
330 int r;
331
332 j = log_namespace ?
333 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
334 "/run/systemd/journal/stdout";
335
336 if (gid_is_valid(gid)) {
337 oldgid = getgid();
338
339 if (setegid(gid) < 0)
340 return -errno;
341 }
342
343 if (uid_is_valid(uid)) {
344 olduid = getuid();
345
346 if (seteuid(uid) < 0) {
347 r = -errno;
348 goto restore_gid;
349 }
350 }
351
352 r = connect_unix_path(fd, AT_FDCWD, j);
353
354 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
355 an LSM interferes. */
356
357 if (uid_is_valid(uid))
358 (void) seteuid(olduid);
359
360 restore_gid:
361 if (gid_is_valid(gid))
362 (void) setegid(oldgid);
363
364 return r;
365 }
366
367 static int connect_logger_as(
368 const Unit *unit,
369 const ExecContext *context,
370 const ExecParameters *params,
371 ExecOutput output,
372 const char *ident,
373 int nfd,
374 uid_t uid,
375 gid_t gid) {
376
377 _cleanup_close_ int fd = -EBADF;
378 int r;
379
380 assert(context);
381 assert(params);
382 assert(output < _EXEC_OUTPUT_MAX);
383 assert(ident);
384 assert(nfd >= 0);
385
386 fd = socket(AF_UNIX, SOCK_STREAM, 0);
387 if (fd < 0)
388 return -errno;
389
390 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
391 if (r < 0)
392 return r;
393
394 if (shutdown(fd, SHUT_RD) < 0)
395 return -errno;
396
397 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
398
399 if (dprintf(fd,
400 "%s\n"
401 "%s\n"
402 "%i\n"
403 "%i\n"
404 "%i\n"
405 "%i\n"
406 "%i\n",
407 context->syslog_identifier ?: ident,
408 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
409 context->syslog_priority,
410 !!context->syslog_level_prefix,
411 false,
412 is_kmsg_output(output),
413 is_terminal_output(output)) < 0)
414 return -errno;
415
416 return move_fd(TAKE_FD(fd), nfd, false);
417 }
418
/* Opens the terminal at 'path' with the given flags (O_NOCTTY is always added) and hands the
 * descriptor to move_fd() so that it becomes fd number 'nfd'. Returns the error of
 * open_terminal() if that fails, otherwise the result of move_fd(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
431
432 static int acquire_path(const char *path, int flags, mode_t mode) {
433 _cleanup_close_ int fd = -EBADF;
434 int r;
435
436 assert(path);
437
438 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
439 flags |= O_CREAT;
440
441 fd = open(path, flags|O_NOCTTY, mode);
442 if (fd >= 0)
443 return TAKE_FD(fd);
444
445 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
446 return -errno;
447
448 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
449
450 fd = socket(AF_UNIX, SOCK_STREAM, 0);
451 if (fd < 0)
452 return -errno;
453
454 r = connect_unix_path(fd, AT_FDCWD, path);
455 if (IN_SET(r, -ENOTSOCK, -EINVAL))
456 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
457 * wasn't an AF_UNIX socket after all */
458 return -ENXIO;
459 if (r < 0)
460 return r;
461
462 if ((flags & O_ACCMODE) == O_RDONLY)
463 r = shutdown(fd, SHUT_WR);
464 else if ((flags & O_ACCMODE) == O_WRONLY)
465 r = shutdown(fd, SHUT_RD);
466 else
467 r = 0;
468 if (r < 0)
469 return -errno;
470
471 return TAKE_FD(fd);
472 }
473
474 static int fixup_input(
475 const ExecContext *context,
476 int socket_fd,
477 bool apply_tty_stdin) {
478
479 ExecInput std_input;
480
481 assert(context);
482
483 std_input = context->std_input;
484
485 if (is_terminal_input(std_input) && !apply_tty_stdin)
486 return EXEC_INPUT_NULL;
487
488 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
489 return EXEC_INPUT_NULL;
490
491 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
492 return EXEC_INPUT_NULL;
493
494 return std_input;
495 }
496
497 static int fixup_output(ExecOutput output, int socket_fd) {
498
499 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
500 return EXEC_OUTPUT_INHERIT;
501
502 return output;
503 }
504
505 static int setup_input(
506 const ExecContext *context,
507 const ExecParameters *params,
508 int socket_fd,
509 const int named_iofds[static 3]) {
510
511 ExecInput i;
512 int r;
513
514 assert(context);
515 assert(params);
516 assert(named_iofds);
517
518 if (params->stdin_fd >= 0) {
519 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
520 return -errno;
521
522 /* Try to make this the controlling tty, if it is a tty, and reset it */
523 if (isatty(STDIN_FILENO)) {
524 unsigned rows = context->tty_rows, cols = context->tty_cols;
525
526 (void) exec_context_tty_size(context, &rows, &cols);
527 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
528 (void) reset_terminal_fd(STDIN_FILENO, true);
529 (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
530 }
531
532 return STDIN_FILENO;
533 }
534
535 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
536
537 switch (i) {
538
539 case EXEC_INPUT_NULL:
540 return open_null_as(O_RDONLY, STDIN_FILENO);
541
542 case EXEC_INPUT_TTY:
543 case EXEC_INPUT_TTY_FORCE:
544 case EXEC_INPUT_TTY_FAIL: {
545 unsigned rows, cols;
546 int fd;
547
548 fd = acquire_terminal(exec_context_tty_path(context),
549 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
550 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
551 ACQUIRE_TERMINAL_WAIT,
552 USEC_INFINITY);
553 if (fd < 0)
554 return fd;
555
556 r = exec_context_tty_size(context, &rows, &cols);
557 if (r < 0)
558 return r;
559
560 r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
561 if (r < 0)
562 return r;
563
564 return move_fd(fd, STDIN_FILENO, false);
565 }
566
567 case EXEC_INPUT_SOCKET:
568 assert(socket_fd >= 0);
569
570 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
571
572 case EXEC_INPUT_NAMED_FD:
573 assert(named_iofds[STDIN_FILENO] >= 0);
574
575 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
576 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
577
578 case EXEC_INPUT_DATA: {
579 int fd;
580
581 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
582 if (fd < 0)
583 return fd;
584
585 return move_fd(fd, STDIN_FILENO, false);
586 }
587
588 case EXEC_INPUT_FILE: {
589 bool rw;
590 int fd;
591
592 assert(context->stdio_file[STDIN_FILENO]);
593
594 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
595 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
596
597 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
598 if (fd < 0)
599 return fd;
600
601 return move_fd(fd, STDIN_FILENO, false);
602 }
603
604 default:
605 assert_not_reached();
606 }
607 }
608
609 static bool can_inherit_stderr_from_stdout(
610 const ExecContext *context,
611 ExecOutput o,
612 ExecOutput e) {
613
614 assert(context);
615
616 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
617 * stderr fd */
618
619 if (e == EXEC_OUTPUT_INHERIT)
620 return true;
621 if (e != o)
622 return false;
623
624 if (e == EXEC_OUTPUT_NAMED_FD)
625 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
626
627 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
628 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
629
630 return true;
631 }
632
633 static int setup_output(
634 const Unit *unit,
635 const ExecContext *context,
636 const ExecParameters *params,
637 int fileno,
638 int socket_fd,
639 const int named_iofds[static 3],
640 const char *ident,
641 uid_t uid,
642 gid_t gid,
643 dev_t *journal_stream_dev,
644 ino_t *journal_stream_ino) {
645
646 ExecOutput o;
647 ExecInput i;
648 int r;
649
650 assert(unit);
651 assert(context);
652 assert(params);
653 assert(ident);
654 assert(journal_stream_dev);
655 assert(journal_stream_ino);
656
657 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
658
659 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
660 return -errno;
661
662 return STDOUT_FILENO;
663 }
664
665 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
666 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
667 return -errno;
668
669 return STDERR_FILENO;
670 }
671
672 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
673 o = fixup_output(context->std_output, socket_fd);
674
675 if (fileno == STDERR_FILENO) {
676 ExecOutput e;
677 e = fixup_output(context->std_error, socket_fd);
678
679 /* This expects the input and output are already set up */
680
681 /* Don't change the stderr file descriptor if we inherit all
682 * the way and are not on a tty */
683 if (e == EXEC_OUTPUT_INHERIT &&
684 o == EXEC_OUTPUT_INHERIT &&
685 i == EXEC_INPUT_NULL &&
686 !is_terminal_input(context->std_input) &&
687 getppid() != 1)
688 return fileno;
689
690 /* Duplicate from stdout if possible */
691 if (can_inherit_stderr_from_stdout(context, o, e))
692 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
693
694 o = e;
695
696 } else if (o == EXEC_OUTPUT_INHERIT) {
697 /* If input got downgraded, inherit the original value */
698 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
699 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
700
701 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
702 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
703 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
704
705 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
706 if (getppid() != 1)
707 return fileno;
708
709 /* We need to open /dev/null here anew, to get the right access mode. */
710 return open_null_as(O_WRONLY, fileno);
711 }
712
713 switch (o) {
714
715 case EXEC_OUTPUT_NULL:
716 return open_null_as(O_WRONLY, fileno);
717
718 case EXEC_OUTPUT_TTY:
719 if (is_terminal_input(i))
720 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
721
722 /* We don't reset the terminal if this is just about output */
723 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
724
725 case EXEC_OUTPUT_KMSG:
726 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
727 case EXEC_OUTPUT_JOURNAL:
728 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
729 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
730 if (r < 0) {
731 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
732 fileno == STDOUT_FILENO ? "stdout" : "stderr");
733 r = open_null_as(O_WRONLY, fileno);
734 } else {
735 struct stat st;
736
737 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
738 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
739 * services to detect whether they are connected to the journal or not.
740 *
741 * If both stdout and stderr are connected to a stream then let's make sure to store the data
742 * about STDERR as that's usually the best way to do logging. */
743
744 if (fstat(fileno, &st) >= 0 &&
745 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
746 *journal_stream_dev = st.st_dev;
747 *journal_stream_ino = st.st_ino;
748 }
749 }
750 return r;
751
752 case EXEC_OUTPUT_SOCKET:
753 assert(socket_fd >= 0);
754
755 return RET_NERRNO(dup2(socket_fd, fileno));
756
757 case EXEC_OUTPUT_NAMED_FD:
758 assert(named_iofds[fileno] >= 0);
759
760 (void) fd_nonblock(named_iofds[fileno], false);
761 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
762
763 case EXEC_OUTPUT_FILE:
764 case EXEC_OUTPUT_FILE_APPEND:
765 case EXEC_OUTPUT_FILE_TRUNCATE: {
766 bool rw;
767 int fd, flags;
768
769 assert(context->stdio_file[fileno]);
770
771 rw = context->std_input == EXEC_INPUT_FILE &&
772 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
773
774 if (rw)
775 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
776
777 flags = O_WRONLY;
778 if (o == EXEC_OUTPUT_FILE_APPEND)
779 flags |= O_APPEND;
780 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
781 flags |= O_TRUNC;
782
783 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
784 if (fd < 0)
785 return fd;
786
787 return move_fd(fd, fileno, 0);
788 }
789
790 default:
791 assert_not_reached();
792 }
793 }
794
795 static int chown_terminal(int fd, uid_t uid) {
796 int r;
797
798 assert(fd >= 0);
799
800 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
801 if (isatty(fd) < 1) {
802 if (IN_SET(errno, EINVAL, ENOTTY))
803 return 0; /* not a tty */
804
805 return -errno;
806 }
807
808 /* This might fail. What matters are the results. */
809 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
810 if (r < 0)
811 return r;
812
813 return 1;
814 }
815
816 static int setup_confirm_stdio(
817 const ExecContext *context,
818 const char *vc,
819 int *ret_saved_stdin,
820 int *ret_saved_stdout) {
821
822 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
823 unsigned rows, cols;
824 int r;
825
826 assert(ret_saved_stdin);
827 assert(ret_saved_stdout);
828
829 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
830 if (saved_stdin < 0)
831 return -errno;
832
833 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
834 if (saved_stdout < 0)
835 return -errno;
836
837 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
838 if (fd < 0)
839 return fd;
840
841 r = chown_terminal(fd, getuid());
842 if (r < 0)
843 return r;
844
845 r = reset_terminal_fd(fd, true);
846 if (r < 0)
847 return r;
848
849 r = exec_context_tty_size(context, &rows, &cols);
850 if (r < 0)
851 return r;
852
853 r = terminal_set_size_fd(fd, vc, rows, cols);
854 if (r < 0)
855 return r;
856
857 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
858 TAKE_FD(fd);
859 if (r < 0)
860 return r;
861
862 *ret_saved_stdin = TAKE_FD(saved_stdin);
863 *ret_saved_stdout = TAKE_FD(saved_stdout);
864 return 0;
865 }
866
867 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
868 assert(err < 0);
869
870 if (err == -ETIMEDOUT)
871 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
872 else {
873 errno = -err;
874 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
875 }
876 }
877
878 static void write_confirm_error(int err, const char *vc, const Unit *u) {
879 _cleanup_close_ int fd = -EBADF;
880
881 assert(vc);
882
883 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
884 if (fd < 0)
885 return;
886
887 write_confirm_error_fd(err, fd, u);
888 }
889
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved descriptors back in place
 * as stdin/stdout and closes the saved copies. Both restores are always attempted; returns 0 on
 * success or the -errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        /* Store safe_close()'s return value back, so that the caller's variables are updated */
        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
911
912 enum {
913 CONFIRM_PRETEND_FAILURE = -1,
914 CONFIRM_PRETEND_SUCCESS = 0,
915 CONFIRM_EXECUTE = 1,
916 };
917
918 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
919 int saved_stdout = -1, saved_stdin = -1, r;
920 _cleanup_free_ char *e = NULL;
921 char c;
922
923 /* For any internal errors, assume a positive response. */
924 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
925 if (r < 0) {
926 write_confirm_error(r, vc, u);
927 return CONFIRM_EXECUTE;
928 }
929
930 /* confirm_spawn might have been disabled while we were sleeping. */
931 if (manager_is_confirm_spawn_disabled(u->manager)) {
932 r = 1;
933 goto restore_stdio;
934 }
935
936 e = ellipsize(cmdline, 60, 100);
937 if (!e) {
938 log_oom();
939 r = CONFIRM_EXECUTE;
940 goto restore_stdio;
941 }
942
943 for (;;) {
944 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
945 if (r < 0) {
946 write_confirm_error_fd(r, STDOUT_FILENO, u);
947 r = CONFIRM_EXECUTE;
948 goto restore_stdio;
949 }
950
951 switch (c) {
952 case 'c':
953 printf("Resuming normal execution.\n");
954 manager_disable_confirm_spawn();
955 r = 1;
956 break;
957 case 'D':
958 unit_dump(u, stdout, " ");
959 continue; /* ask again */
960 case 'f':
961 printf("Failing execution.\n");
962 r = CONFIRM_PRETEND_FAILURE;
963 break;
964 case 'h':
965 printf(" c - continue, proceed without asking anymore\n"
966 " D - dump, show the state of the unit\n"
967 " f - fail, don't execute the command and pretend it failed\n"
968 " h - help\n"
969 " i - info, show a short summary of the unit\n"
970 " j - jobs, show jobs that are in progress\n"
971 " s - skip, don't execute the command and pretend it succeeded\n"
972 " y - yes, execute the command\n");
973 continue; /* ask again */
974 case 'i':
975 printf(" Description: %s\n"
976 " Unit: %s\n"
977 " Command: %s\n",
978 u->id, u->description, cmdline);
979 continue; /* ask again */
980 case 'j':
981 manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, " ");
982 continue; /* ask again */
983 case 'n':
984 /* 'n' was removed in favor of 'f'. */
985 printf("Didn't understand 'n', did you mean 'f'?\n");
986 continue; /* ask again */
987 case 's':
988 printf("Skipping execution.\n");
989 r = CONFIRM_PRETEND_SUCCESS;
990 break;
991 case 'y':
992 r = CONFIRM_EXECUTE;
993 break;
994 default:
995 assert_not_reached();
996 }
997 break;
998 }
999
1000 restore_stdio:
1001 restore_confirm_stdio(&saved_stdin, &saved_stdout);
1002 return r;
1003 }
1004
1005 static int get_fixed_user(const ExecContext *c, const char **user,
1006 uid_t *uid, gid_t *gid,
1007 const char **home, const char **shell) {
1008 int r;
1009 const char *name;
1010
1011 assert(c);
1012
1013 if (!c->user)
1014 return 0;
1015
1016 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1017 * (i.e. are "/" or "/bin/nologin"). */
1018
1019 name = c->user;
1020 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
1021 if (r < 0)
1022 return r;
1023
1024 *user = name;
1025 return 0;
1026 }
1027
1028 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
1029 int r;
1030 const char *name;
1031
1032 assert(c);
1033
1034 if (!c->group)
1035 return 0;
1036
1037 name = c->group;
1038 r = get_group_creds(&name, gid, 0);
1039 if (r < 0)
1040 return r;
1041
1042 *group = name;
1043 return 0;
1044 }
1045
1046 static int get_supplementary_groups(const ExecContext *c, const char *user,
1047 const char *group, gid_t gid,
1048 gid_t **supplementary_gids, int *ngids) {
1049 int r, k = 0;
1050 int ngroups_max;
1051 bool keep_groups = false;
1052 gid_t *groups = NULL;
1053 _cleanup_free_ gid_t *l_gids = NULL;
1054
1055 assert(c);
1056
1057 /*
1058 * If user is given, then lookup GID and supplementary groups list.
1059 * We avoid NSS lookups for gid=0. Also we have to initialize groups
1060 * here and as early as possible so we keep the list of supplementary
1061 * groups of the caller.
1062 */
1063 if (user && gid_is_valid(gid) && gid != 0) {
1064 /* First step, initialize groups from /etc/groups */
1065 if (initgroups(user, gid) < 0)
1066 return -errno;
1067
1068 keep_groups = true;
1069 }
1070
1071 if (strv_isempty(c->supplementary_groups))
1072 return 0;
1073
1074 /*
1075 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1076 * be positive, otherwise fail.
1077 */
1078 errno = 0;
1079 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1080 if (ngroups_max <= 0)
1081 return errno_or_else(EOPNOTSUPP);
1082
1083 l_gids = new(gid_t, ngroups_max);
1084 if (!l_gids)
1085 return -ENOMEM;
1086
1087 if (keep_groups) {
1088 /*
1089 * Lookup the list of groups that the user belongs to, we
1090 * avoid NSS lookups here too for gid=0.
1091 */
1092 k = ngroups_max;
1093 if (getgrouplist(user, gid, l_gids, &k) < 0)
1094 return -EINVAL;
1095 } else
1096 k = 0;
1097
1098 STRV_FOREACH(i, c->supplementary_groups) {
1099 const char *g;
1100
1101 if (k >= ngroups_max)
1102 return -E2BIG;
1103
1104 g = *i;
1105 r = get_group_creds(&g, l_gids+k, 0);
1106 if (r < 0)
1107 return r;
1108
1109 k++;
1110 }
1111
1112 /*
1113 * Sets ngids to zero to drop all supplementary groups, happens
1114 * when we are under root and SupplementaryGroups= is empty.
1115 */
1116 if (k == 0) {
1117 *ngids = 0;
1118 return 0;
1119 }
1120
1121 /* Otherwise get the final list of supplementary groups */
1122 groups = memdup(l_gids, sizeof(gid_t) * k);
1123 if (!groups)
1124 return -ENOMEM;
1125
1126 *supplementary_gids = groups;
1127 *ngids = k;
1128
1129 groups = NULL;
1130
1131 return 0;
1132 }
1133
/* Applies the group credentials to the calling process: first the supplementary groups (if
 * 'ngids' is positive), then the real, effective and saved gid (if 'gid' is valid). Returns 0 on
 * success, a negative errno-style error otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r;

                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1152
/* Updates the securebits of the calling process: all bits in 'mask' are cleared, then all bits
 * in 'bits' are set. Returns 0 if the securebits already had the requested value (nothing is
 * written then), 1 if they were changed, and -errno if reading or writing them failed. */
static int set_securebits(unsigned bits, unsigned mask) {
        unsigned before, after;
        int raw;

        raw = prctl(PR_GET_SECUREBITS);
        if (raw < 0)
                return -errno;

        before = (unsigned) raw;

        /* Clear all securebits defined in mask and set bits */
        after = (before & ~mask) | bits;
        if (after == before)
                return 0;

        if (prctl(PR_SET_SECUREBITS, after) < 0)
                return -errno;

        return 1;
}
1171
1172 static int enforce_user(
1173 const ExecContext *context,
1174 uid_t uid,
1175 uint64_t capability_ambient_set) {
1176 assert(context);
1177 int r;
1178
1179 if (!uid_is_valid(uid))
1180 return 0;
1181
1182 /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1183 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1184 * case. */
1185
1186 if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1187
1188 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1189 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1190 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1191 if (r < 0)
1192 return r;
1193 }
1194
1195 /* Second step: actually set the uids */
1196 if (setresuid(uid, uid, uid) < 0)
1197 return -errno;
1198
1199 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1200 * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1201 * outside of this call. */
1202 return 0;
1203 }
1204
#if HAVE_PAM

/* PAM conversation callback that refuses every conversation: we don't support conversations,
 * hence all arguments are ignored and PAM_CONV_ERR is returned unconditionally. */
static int null_conv(int num_msg, const struct pam_message **msg, struct pam_response **resp, void *appdata_ptr) {
        return PAM_CONV_ERR;
}

#endif
1219
/* Opens a PAM session and forks off a helper process, "(sd-pam)", that tears it down again later.
 *
 *   name       PAM service name handed to pam_start()
 *   user       user name handed to pam_start()
 *   uid, gid   credentials the helper process switches to before it starts waiting
 *   tty        TTY path to register as PAM_TTY; if NULL and stdin is a TTY, "/dev/" + its name is used
 *   env        environment block; each entry is exported to PAM via pam_putenv(), and on success the
 *              block is replaced by the list returned by pam_getenvlist()
 *   fds/n_fds  file descriptors that are closed in the helper process
 *
 * On success the result of strv_free_and_replace() is returned. On failure a negative errno-style
 * value is returned; PAM errors are reported as -EPERM because they do not map to errno. When built
 * without PAM support the function does nothing and returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately not fatal. */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() reports failure through its return value (a positive error
                                 * number); it neither returns a negative value nor sets errno. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1436
1437 static void rename_process_from_path(const char *path) {
1438 _cleanup_free_ char *buf = NULL;
1439 const char *p;
1440
1441 assert(path);
1442
1443 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1444 * /bin/ps */
1445
1446 if (path_extract_filename(path, &buf) < 0) {
1447 rename_process("(...)");
1448 return;
1449 }
1450
1451 size_t l = strlen(buf);
1452 if (l > 8) {
1453 /* The end of the process name is usually more interesting, since the first bit might just be
1454 * "systemd-" */
1455 p = buf + l - 8;
1456 l = 8;
1457 } else
1458 p = buf;
1459
1460 char process_name[11];
1461 process_name[0] = '(';
1462 memcpy(process_name+1, p, l);
1463 process_name[1+l] = ')';
1464 process_name[1+l+1] = 0;
1465
1466 rename_process(process_name);
1467 }
1468
1469 static bool context_has_address_families(const ExecContext *c) {
1470 assert(c);
1471
1472 return c->address_families_allow_list ||
1473 !set_isempty(c->address_families);
1474 }
1475
1476 static bool context_has_syscall_filters(const ExecContext *c) {
1477 assert(c);
1478
1479 return c->syscall_allow_list ||
1480 !hashmap_isempty(c->syscall_filter);
1481 }
1482
1483 static bool context_has_syscall_logs(const ExecContext *c) {
1484 assert(c);
1485
1486 return c->syscall_log_allow_list ||
1487 !hashmap_isempty(c->syscall_log);
1488 }
1489
1490 static bool context_has_no_new_privileges(const ExecContext *c) {
1491 assert(c);
1492
1493 if (c->no_new_privileges)
1494 return true;
1495
1496 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1497 return false;
1498
1499 /* We need NNP if we have any form of seccomp and are unprivileged */
1500 return c->lock_personality ||
1501 c->memory_deny_write_execute ||
1502 c->private_devices ||
1503 c->protect_clock ||
1504 c->protect_hostname ||
1505 c->protect_kernel_tunables ||
1506 c->protect_kernel_modules ||
1507 c->protect_kernel_logs ||
1508 context_has_address_families(c) ||
1509 exec_context_restrict_namespaces_set(c) ||
1510 c->restrict_realtime ||
1511 c->restrict_suid_sgid ||
1512 !set_isempty(c->syscall_archs) ||
1513 context_has_syscall_filters(c) ||
1514 context_has_syscall_logs(c);
1515 }
1516
1517 bool exec_context_has_credentials(const ExecContext *context) {
1518
1519 assert(context);
1520
1521 return !hashmap_isempty(context->set_credentials) ||
1522 !hashmap_isempty(context->load_credentials) ||
1523 !set_isempty(context->import_credentials);
1524 }
1525
1526 #if HAVE_SECCOMP
1527
1528 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1529
1530 if (is_seccomp_available())
1531 return false;
1532
1533 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1534 return true;
1535 }
1536
1537 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1538 uint32_t negative_action, default_action, action;
1539 int r;
1540
1541 assert(u);
1542 assert(c);
1543
1544 if (!context_has_syscall_filters(c))
1545 return 0;
1546
1547 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1548 return 0;
1549
1550 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1551
1552 if (c->syscall_allow_list) {
1553 default_action = negative_action;
1554 action = SCMP_ACT_ALLOW;
1555 } else {
1556 default_action = SCMP_ACT_ALLOW;
1557 action = negative_action;
1558 }
1559
1560 if (needs_ambient_hack) {
1561 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1562 if (r < 0)
1563 return r;
1564 }
1565
1566 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1567 }
1568
1569 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1570 #ifdef SCMP_ACT_LOG
1571 uint32_t default_action, action;
1572 #endif
1573
1574 assert(u);
1575 assert(c);
1576
1577 if (!context_has_syscall_logs(c))
1578 return 0;
1579
1580 #ifdef SCMP_ACT_LOG
1581 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1582 return 0;
1583
1584 if (c->syscall_log_allow_list) {
1585 /* Log nothing but the ones listed */
1586 default_action = SCMP_ACT_ALLOW;
1587 action = SCMP_ACT_LOG;
1588 } else {
1589 /* Log everything but the ones listed */
1590 default_action = SCMP_ACT_LOG;
1591 action = SCMP_ACT_ALLOW;
1592 }
1593
1594 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1595 #else
1596 /* old libseccomp */
1597 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1598 return 0;
1599 #endif
1600 }
1601
1602 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1603 assert(u);
1604 assert(c);
1605
1606 if (set_isempty(c->syscall_archs))
1607 return 0;
1608
1609 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1610 return 0;
1611
1612 return seccomp_restrict_archs(c->syscall_archs);
1613 }
1614
1615 static int apply_address_families(const Unit* u, const ExecContext *c) {
1616 assert(u);
1617 assert(c);
1618
1619 if (!context_has_address_families(c))
1620 return 0;
1621
1622 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1623 return 0;
1624
1625 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1626 }
1627
1628 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1629 int r;
1630
1631 assert(u);
1632 assert(c);
1633
1634 if (!c->memory_deny_write_execute)
1635 return 0;
1636
1637 /* use prctl() if kernel supports it (6.3) */
1638 r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1639 if (r == 0) {
1640 log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1641 return 0;
1642 }
1643 if (r < 0 && errno != EINVAL)
1644 return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1645 /* else use seccomp */
1646 log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1647
1648 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1649 return 0;
1650
1651 return seccomp_memory_deny_write_execute();
1652 }
1653
1654 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1655 assert(u);
1656 assert(c);
1657
1658 if (!c->restrict_realtime)
1659 return 0;
1660
1661 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1662 return 0;
1663
1664 return seccomp_restrict_realtime();
1665 }
1666
1667 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1668 assert(u);
1669 assert(c);
1670
1671 if (!c->restrict_suid_sgid)
1672 return 0;
1673
1674 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1675 return 0;
1676
1677 return seccomp_restrict_suid_sgid();
1678 }
1679
1680 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1681 assert(u);
1682 assert(c);
1683
1684 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1685 * let's protect even those systems where this is left on in the kernel. */
1686
1687 if (!c->protect_kernel_tunables)
1688 return 0;
1689
1690 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1691 return 0;
1692
1693 return seccomp_protect_sysctl();
1694 }
1695
1696 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1697 assert(u);
1698 assert(c);
1699
1700 /* Turn off module syscalls on ProtectKernelModules=yes */
1701
1702 if (!c->protect_kernel_modules)
1703 return 0;
1704
1705 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1706 return 0;
1707
1708 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1709 }
1710
1711 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1712 assert(u);
1713 assert(c);
1714
1715 if (!c->protect_kernel_logs)
1716 return 0;
1717
1718 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1719 return 0;
1720
1721 return seccomp_protect_syslog();
1722 }
1723
1724 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1725 assert(u);
1726 assert(c);
1727
1728 if (!c->protect_clock)
1729 return 0;
1730
1731 if (skip_seccomp_unavailable(u, "ProtectClock="))
1732 return 0;
1733
1734 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1735 }
1736
1737 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1738 assert(u);
1739 assert(c);
1740
1741 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1742
1743 if (!c->private_devices)
1744 return 0;
1745
1746 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1747 return 0;
1748
1749 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1750 }
1751
1752 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1753 assert(u);
1754 assert(c);
1755
1756 if (!exec_context_restrict_namespaces_set(c))
1757 return 0;
1758
1759 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1760 return 0;
1761
1762 return seccomp_restrict_namespaces(c->restrict_namespaces);
1763 }
1764
1765 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1766 unsigned long personality;
1767 int r;
1768
1769 assert(u);
1770 assert(c);
1771
1772 if (!c->lock_personality)
1773 return 0;
1774
1775 if (skip_seccomp_unavailable(u, "LockPersonality="))
1776 return 0;
1777
1778 personality = c->personality;
1779
1780 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1781 if (personality == PERSONALITY_INVALID) {
1782
1783 r = opinionated_personality(&personality);
1784 if (r < 0)
1785 return r;
1786 }
1787
1788 return seccomp_lock_personality(personality);
1789 }
1790
1791 #endif
1792
1793 #if HAVE_LIBBPF
1794 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1795 assert(u);
1796 assert(c);
1797
1798 if (!exec_context_restrict_filesystems_set(c))
1799 return 0;
1800
1801 if (!u->manager->restrict_fs) {
1802 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1803 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1804 return 0;
1805 }
1806
1807 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1808 }
1809 #endif
1810
1811 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1812 assert(u);
1813 assert(c);
1814
1815 if (!c->protect_hostname)
1816 return 0;
1817
1818 if (ns_type_supported(NAMESPACE_UTS)) {
1819 if (unshare(CLONE_NEWUTS) < 0) {
1820 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1821 *ret_exit_status = EXIT_NAMESPACE;
1822 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1823 }
1824
1825 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1826 }
1827 } else
1828 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1829
1830 #if HAVE_SECCOMP
1831 int r;
1832
1833 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1834 return 0;
1835
1836 r = seccomp_protect_hostname();
1837 if (r < 0) {
1838 *ret_exit_status = EXIT_SECCOMP;
1839 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1840 }
1841 #endif
1842
1843 return 0;
1844 }
1845
1846 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1847 assert(idle_pipe);
1848
1849 idle_pipe[1] = safe_close(idle_pipe[1]);
1850 idle_pipe[2] = safe_close(idle_pipe[2]);
1851
1852 if (idle_pipe[0] >= 0) {
1853 int r;
1854
1855 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1856
1857 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1858 ssize_t n;
1859
1860 /* Signal systemd that we are bored and want to continue. */
1861 n = write(idle_pipe[3], "x", 1);
1862 if (n > 0)
1863 /* Wait for systemd to react to the signal above. */
1864 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1865 }
1866
1867 idle_pipe[0] = safe_close(idle_pipe[0]);
1868
1869 }
1870
1871 idle_pipe[3] = safe_close(idle_pipe[3]);
1872 }
1873
1874 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1875
1876 static int build_environment(
1877 const Unit *u,
1878 const ExecContext *c,
1879 const ExecParameters *p,
1880 const CGroupContext *cgroup_context,
1881 size_t n_fds,
1882 char **fdnames,
1883 const char *home,
1884 const char *username,
1885 const char *shell,
1886 dev_t journal_stream_dev,
1887 ino_t journal_stream_ino,
1888 const char *memory_pressure_path,
1889 char ***ret) {
1890
1891 _cleanup_strv_free_ char **our_env = NULL;
1892 size_t n_env = 0;
1893 char *x;
1894 int r;
1895
1896 assert(u);
1897 assert(c);
1898 assert(p);
1899 assert(ret);
1900
1901 #define N_ENV_VARS 19
1902 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1903 if (!our_env)
1904 return -ENOMEM;
1905
1906 if (n_fds > 0) {
1907 _cleanup_free_ char *joined = NULL;
1908
1909 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1910 return -ENOMEM;
1911 our_env[n_env++] = x;
1912
1913 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1914 return -ENOMEM;
1915 our_env[n_env++] = x;
1916
1917 joined = strv_join(fdnames, ":");
1918 if (!joined)
1919 return -ENOMEM;
1920
1921 x = strjoin("LISTEN_FDNAMES=", joined);
1922 if (!x)
1923 return -ENOMEM;
1924 our_env[n_env++] = x;
1925 }
1926
1927 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1928 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1929 return -ENOMEM;
1930 our_env[n_env++] = x;
1931
1932 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1933 return -ENOMEM;
1934 our_env[n_env++] = x;
1935 }
1936
1937 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1938 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1939 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1940 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1941 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1942 if (!x)
1943 return -ENOMEM;
1944 our_env[n_env++] = x;
1945 }
1946
1947 if (home) {
1948 x = strjoin("HOME=", home);
1949 if (!x)
1950 return -ENOMEM;
1951
1952 path_simplify(x + 5);
1953 our_env[n_env++] = x;
1954 }
1955
1956 if (username) {
1957 x = strjoin("LOGNAME=", username);
1958 if (!x)
1959 return -ENOMEM;
1960 our_env[n_env++] = x;
1961
1962 x = strjoin("USER=", username);
1963 if (!x)
1964 return -ENOMEM;
1965 our_env[n_env++] = x;
1966 }
1967
1968 if (shell) {
1969 x = strjoin("SHELL=", shell);
1970 if (!x)
1971 return -ENOMEM;
1972
1973 path_simplify(x + 6);
1974 our_env[n_env++] = x;
1975 }
1976
1977 if (!sd_id128_is_null(u->invocation_id)) {
1978 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1979 return -ENOMEM;
1980
1981 our_env[n_env++] = x;
1982 }
1983
1984 if (exec_context_needs_term(c)) {
1985 _cleanup_free_ char *cmdline = NULL;
1986 const char *tty_path, *term = NULL;
1987
1988 tty_path = exec_context_tty_path(c);
1989
1990 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1991 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1992 * container manager passes to PID 1 ends up all the way in the console login shown. */
1993
1994 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1995 term = getenv("TERM");
1996 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1997 _cleanup_free_ char *key = NULL;
1998
1999 key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
2000 if (!key)
2001 return -ENOMEM;
2002
2003 r = proc_cmdline_get_key(key, 0, &cmdline);
2004 if (r < 0)
2005 log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
2006 else if (r > 0)
2007 term = cmdline;
2008 }
2009
2010 if (!term)
2011 term = default_term_for_tty(tty_path);
2012
2013 x = strjoin("TERM=", term);
2014 if (!x)
2015 return -ENOMEM;
2016 our_env[n_env++] = x;
2017 }
2018
2019 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
2020 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
2021 return -ENOMEM;
2022
2023 our_env[n_env++] = x;
2024 }
2025
2026 if (c->log_namespace) {
2027 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
2028 if (!x)
2029 return -ENOMEM;
2030
2031 our_env[n_env++] = x;
2032 }
2033
2034 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2035 _cleanup_free_ char *joined = NULL;
2036 const char *n;
2037
2038 if (!p->prefix[t])
2039 continue;
2040
2041 if (c->directories[t].n_items == 0)
2042 continue;
2043
2044 n = exec_directory_env_name_to_string(t);
2045 if (!n)
2046 continue;
2047
2048 for (size_t i = 0; i < c->directories[t].n_items; i++) {
2049 _cleanup_free_ char *prefixed = NULL;
2050
2051 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2052 if (!prefixed)
2053 return -ENOMEM;
2054
2055 if (!strextend_with_separator(&joined, ":", prefixed))
2056 return -ENOMEM;
2057 }
2058
2059 x = strjoin(n, "=", joined);
2060 if (!x)
2061 return -ENOMEM;
2062
2063 our_env[n_env++] = x;
2064 }
2065
2066 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
2067 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
2068 if (!x)
2069 return -ENOMEM;
2070
2071 our_env[n_env++] = x;
2072 }
2073
2074 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2075 return -ENOMEM;
2076
2077 our_env[n_env++] = x;
2078
2079 if (memory_pressure_path) {
2080 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2081 if (!x)
2082 return -ENOMEM;
2083
2084 our_env[n_env++] = x;
2085
2086 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2087 _cleanup_free_ char *b = NULL, *e = NULL;
2088
2089 if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2090 MEMORY_PRESSURE_DEFAULT_TYPE,
2091 cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2092 CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2093 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2094 return -ENOMEM;
2095
2096 if (base64mem(b, strlen(b) + 1, &e) < 0)
2097 return -ENOMEM;
2098
2099 x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2100 if (!x)
2101 return -ENOMEM;
2102
2103 our_env[n_env++] = x;
2104 }
2105 }
2106
2107 assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2108 #undef N_ENV_VARS
2109
2110 *ret = TAKE_PTR(our_env);
2111
2112 return 0;
2113 }
2114
2115 static int build_pass_environment(const ExecContext *c, char ***ret) {
2116 _cleanup_strv_free_ char **pass_env = NULL;
2117 size_t n_env = 0;
2118
2119 STRV_FOREACH(i, c->pass_environment) {
2120 _cleanup_free_ char *x = NULL;
2121 char *v;
2122
2123 v = getenv(*i);
2124 if (!v)
2125 continue;
2126 x = strjoin(*i, "=", v);
2127 if (!x)
2128 return -ENOMEM;
2129
2130 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2131 return -ENOMEM;
2132
2133 pass_env[n_env++] = TAKE_PTR(x);
2134 pass_env[n_env] = NULL;
2135 }
2136
2137 *ret = TAKE_PTR(pass_env);
2138
2139 return 0;
2140 }
2141
2142 bool exec_needs_network_namespace(const ExecContext *context) {
2143 assert(context);
2144
2145 return context->private_network || context->network_namespace_path;
2146 }
2147
2148 static bool exec_needs_ephemeral(const ExecContext *context) {
2149 return (context->root_image || context->root_directory) && context->root_ephemeral;
2150 }
2151
2152 static bool exec_needs_ipc_namespace(const ExecContext *context) {
2153 assert(context);
2154
2155 return context->private_ipc || context->ipc_namespace_path;
2156 }
2157
2158 bool exec_needs_mount_namespace(
2159 const ExecContext *context,
2160 const ExecParameters *params,
2161 const ExecRuntime *runtime) {
2162
2163 assert(context);
2164
2165 if (context->root_image)
2166 return true;
2167
2168 if (!strv_isempty(context->read_write_paths) ||
2169 !strv_isempty(context->read_only_paths) ||
2170 !strv_isempty(context->inaccessible_paths) ||
2171 !strv_isempty(context->exec_paths) ||
2172 !strv_isempty(context->no_exec_paths))
2173 return true;
2174
2175 if (context->n_bind_mounts > 0)
2176 return true;
2177
2178 if (context->n_temporary_filesystems > 0)
2179 return true;
2180
2181 if (context->n_mount_images > 0)
2182 return true;
2183
2184 if (context->n_extension_images > 0)
2185 return true;
2186
2187 if (!strv_isempty(context->extension_directories))
2188 return true;
2189
2190 if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
2191 return true;
2192
2193 if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
2194 return true;
2195
2196 if (context->private_devices ||
2197 context->private_mounts > 0 ||
2198 (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
2199 context->protect_system != PROTECT_SYSTEM_NO ||
2200 context->protect_home != PROTECT_HOME_NO ||
2201 context->protect_kernel_tunables ||
2202 context->protect_kernel_modules ||
2203 context->protect_kernel_logs ||
2204 context->protect_control_groups ||
2205 context->protect_proc != PROTECT_PROC_DEFAULT ||
2206 context->proc_subset != PROC_SUBSET_ALL ||
2207 exec_needs_ipc_namespace(context))
2208 return true;
2209
2210 if (context->root_directory) {
2211 if (exec_context_get_effective_mount_apivfs(context))
2212 return true;
2213
2214 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2215 if (params && !params->prefix[t])
2216 continue;
2217
2218 if (context->directories[t].n_items > 0)
2219 return true;
2220 }
2221 }
2222
2223 if (context->dynamic_user &&
2224 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2225 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2226 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2227 return true;
2228
2229 if (context->log_namespace)
2230 return true;
2231
2232 return false;
2233 }
2234
2235 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2236 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2237 _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
2238 _cleanup_close_ int unshare_ready_fd = -EBADF;
2239 _cleanup_(sigkill_waitp) pid_t pid = 0;
2240 uint64_t c = 1;
2241 ssize_t n;
2242 int r;
2243
2244 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2245 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2246 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2247 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2248 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2249 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2250 * continues execution normally.
2251 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2252 * does not need CAP_SETUID to write the single line mapping to itself. */
2253
2254 /* Can only set up multiple mappings with CAP_SETUID. */
2255 if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2256 r = asprintf(&uid_map,
2257 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2258 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2259 ouid, ouid, uid, uid);
2260 else
2261 r = asprintf(&uid_map,
2262 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2263 ouid, ouid);
2264
2265 if (r < 0)
2266 return -ENOMEM;
2267
2268 /* Can only set up multiple mappings with CAP_SETGID. */
2269 if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2270 r = asprintf(&gid_map,
2271 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2272 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2273 ogid, ogid, gid, gid);
2274 else
2275 r = asprintf(&gid_map,
2276 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2277 ogid, ogid);
2278
2279 if (r < 0)
2280 return -ENOMEM;
2281
2282 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2283 * namespace. */
2284 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2285 if (unshare_ready_fd < 0)
2286 return -errno;
2287
2288 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2289 * failed. */
2290 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2291 return -errno;
2292
2293 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2294 if (r < 0)
2295 return r;
2296 if (r == 0) {
2297 _cleanup_close_ int fd = -EBADF;
2298 const char *a;
2299 pid_t ppid;
2300
2301 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2302 * here, after the parent opened its own user namespace. */
2303
2304 ppid = getppid();
2305 errno_pipe[0] = safe_close(errno_pipe[0]);
2306
2307 /* Wait until the parent unshared the user namespace */
2308 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2309 r = -errno;
2310 goto child_fail;
2311 }
2312
2313 /* Disable the setgroups() system call in the child user namespace, for good. */
2314 a = procfs_file_alloca(ppid, "setgroups");
2315 fd = open(a, O_WRONLY|O_CLOEXEC);
2316 if (fd < 0) {
2317 if (errno != ENOENT) {
2318 r = -errno;
2319 goto child_fail;
2320 }
2321
2322 /* If the file is missing the kernel is too old, let's continue anyway. */
2323 } else {
2324 if (write(fd, "deny\n", 5) < 0) {
2325 r = -errno;
2326 goto child_fail;
2327 }
2328
2329 fd = safe_close(fd);
2330 }
2331
2332 /* First write the GID map */
2333 a = procfs_file_alloca(ppid, "gid_map");
2334 fd = open(a, O_WRONLY|O_CLOEXEC);
2335 if (fd < 0) {
2336 r = -errno;
2337 goto child_fail;
2338 }
2339 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2340 r = -errno;
2341 goto child_fail;
2342 }
2343 fd = safe_close(fd);
2344
2345 /* Then write the UID map */
2346 a = procfs_file_alloca(ppid, "uid_map");
2347 fd = open(a, O_WRONLY|O_CLOEXEC);
2348 if (fd < 0) {
2349 r = -errno;
2350 goto child_fail;
2351 }
2352 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2353 r = -errno;
2354 goto child_fail;
2355 }
2356
2357 _exit(EXIT_SUCCESS);
2358
2359 child_fail:
2360 (void) write(errno_pipe[1], &r, sizeof(r));
2361 _exit(EXIT_FAILURE);
2362 }
2363
2364 errno_pipe[1] = safe_close(errno_pipe[1]);
2365
2366 if (unshare(CLONE_NEWUSER) < 0)
2367 return -errno;
2368
2369 /* Let the child know that the namespace is ready now */
2370 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2371 return -errno;
2372
2373 /* Try to read an error code from the child */
2374 n = read(errno_pipe[0], &r, sizeof(r));
2375 if (n < 0)
2376 return -errno;
2377 if (n == sizeof(r)) { /* an error code was sent to us */
2378 if (r < 0)
2379 return r;
2380 return -EIO;
2381 }
2382 if (n != 0) /* on success we should have read 0 bytes */
2383 return -EIO;
2384
2385 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2386 if (r < 0)
2387 return r;
2388 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2389 return -EIO;
2390
2391 return 0;
2392 }
2393
2394 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2395 assert(context);
2396
2397 if (!context->dynamic_user)
2398 return false;
2399
2400 if (type == EXEC_DIRECTORY_CONFIGURATION)
2401 return false;
2402
2403 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2404 return false;
2405
2406 return true;
2407 }
2408
2409 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2410 _cleanup_free_ char *src_abs = NULL;
2411 int r;
2412
2413 assert(source);
2414
2415 src_abs = path_join(root, source);
2416 if (!src_abs)
2417 return -ENOMEM;
2418
2419 STRV_FOREACH(dst, symlinks) {
2420 _cleanup_free_ char *dst_abs = NULL;
2421
2422 dst_abs = path_join(root, *dst);
2423 if (!dst_abs)
2424 return -ENOMEM;
2425
2426 r = mkdir_parents_label(dst_abs, 0755);
2427 if (r < 0)
2428 return r;
2429
2430 r = symlink_idempotent(src_abs, dst_abs, true);
2431 if (r < 0)
2432 return r;
2433 }
2434
2435 return 0;
2436 }
2437
2438 static int setup_exec_directory(
2439 Unit *u,
2440 const ExecContext *context,
2441 const ExecParameters *params,
2442 uid_t uid,
2443 gid_t gid,
2444 ExecDirectoryType type,
2445 bool needs_mount_namespace,
2446 int *exit_status) {
2447
2448 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2449 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2450 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2451 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2452 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2453 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2454 };
2455 int r;
2456
2457 assert(context);
2458 assert(params);
2459 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2460 assert(exit_status);
2461
2462 if (!params->prefix[type])
2463 return 0;
2464
2465 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2466 if (!uid_is_valid(uid))
2467 uid = 0;
2468 if (!gid_is_valid(gid))
2469 gid = 0;
2470 }
2471
2472 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2473 _cleanup_free_ char *p = NULL, *pp = NULL;
2474
2475 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2476 if (!p) {
2477 r = -ENOMEM;
2478 goto fail;
2479 }
2480
2481 r = mkdir_parents_label(p, 0755);
2482 if (r < 0)
2483 goto fail;
2484
2485 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2486
2487 /* If we are in user mode, and a configuration directory exists but a state directory
2488 * doesn't exist, then we likely are upgrading from an older systemd version that
2489 * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2490 * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2491 * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now
2492 * separated. If a service has both dirs configured but only the configuration dir
2493 * exists and the state dir does not, we assume we are looking at an update
2494 * situation. Hence, create a compatibility symlink, so that all expectations are
2495 * met.
2496 *
2497 * (We also do something similar with the log directory, which still doesn't exist in
2498 * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2499
2500 /* this assumes the state dir is always created before the configuration dir */
2501 assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2502 assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2503
2504 r = laccess(p, F_OK);
2505 if (r == -ENOENT) {
2506 _cleanup_free_ char *q = NULL;
2507
2508 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2509 * under the configuration hierarchy. */
2510
2511 if (type == EXEC_DIRECTORY_STATE)
2512 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2513 else if (type == EXEC_DIRECTORY_LOGS)
2514 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2515 else
2516 assert_not_reached();
2517 if (!q) {
2518 r = -ENOMEM;
2519 goto fail;
2520 }
2521
2522 r = laccess(q, F_OK);
2523 if (r >= 0) {
2524 /* It does exist! This hence looks like an update. Symlink the
2525 * configuration directory into the state directory. */
2526
2527 r = symlink_idempotent(q, p, /* make_relative= */ true);
2528 if (r < 0)
2529 goto fail;
2530
2531 log_unit_notice(u, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2532 continue;
2533 } else if (r != -ENOENT)
2534 log_unit_warning_errno(u, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2535
2536 } else if (r < 0)
2537 log_unit_warning_errno(u, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2538 }
2539
2540 if (exec_directory_is_private(context, type)) {
2541 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2542 * case we want to avoid leaving a directory around fully accessible that is owned by
2543 * a dynamic user whose UID is later on reused. To lock this down we use the same
2544 * trick used by container managers to prohibit host users to get access to files of
2545 * the same UID in containers: we place everything inside a directory that has an
2546 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2547 * for unprivileged host code. We then use fs namespacing to make this directory
2548 * permeable for the service itself.
2549 *
2550 * Specifically: for a service which wants a special directory "foo/" we first create
2551 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2552 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2553 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2554 * unprivileged host users can't look into it. Inside of the namespace of the unit
2555 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2556 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2557 * for the service and making sure it only gets access to the dirs it needs but no
2558 * others. Tricky? Yes, absolutely, but it works!
2559 *
2560 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2561 * to be owned by the service itself.
2562 *
2563 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2564 * for sharing files or sockets with other services. */
2565
2566 pp = path_join(params->prefix[type], "private");
2567 if (!pp) {
2568 r = -ENOMEM;
2569 goto fail;
2570 }
2571
2572 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2573 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2574 if (r < 0)
2575 goto fail;
2576
2577 if (!path_extend(&pp, context->directories[type].items[i].path)) {
2578 r = -ENOMEM;
2579 goto fail;
2580 }
2581
2582 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2583 r = mkdir_parents_label(pp, 0755);
2584 if (r < 0)
2585 goto fail;
2586
2587 if (is_dir(p, false) > 0 &&
2588 (laccess(pp, F_OK) == -ENOENT)) {
2589
2590 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2591 * it over. Most likely the service has been upgraded from one that didn't use
2592 * DynamicUser=1, to one that does. */
2593
2594 log_unit_info(u, "Found pre-existing public %s= directory %s, migrating to %s.\n"
2595 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2596 exec_directory_type_to_string(type), p, pp);
2597
2598 r = RET_NERRNO(rename(p, pp));
2599 if (r < 0)
2600 goto fail;
2601 } else {
2602 /* Otherwise, create the actual directory for the service */
2603
2604 r = mkdir_label(pp, context->directories[type].mode);
2605 if (r < 0 && r != -EEXIST)
2606 goto fail;
2607 }
2608
2609 if (!context->directories[type].items[i].only_create) {
2610 /* And link it up from the original place.
2611 * Notes
2612 * 1) If a mount namespace is going to be used, then this symlink remains on
2613 * the host, and a new one for the child namespace will be created later.
2614 * 2) It is not necessary to create this symlink when one of its parent
2615 * directories is specified and already created. E.g.
2616 * StateDirectory=foo foo/bar
2617 * In that case, the inode points to pp and p for "foo/bar" are the same:
2618 * pp = "/var/lib/private/foo/bar"
2619 * p = "/var/lib/foo/bar"
2620 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2621 * we do not need to create the symlink, but we cannot create the symlink.
2622 * See issue #24783. */
2623 r = symlink_idempotent(pp, p, true);
2624 if (r < 0)
2625 goto fail;
2626 }
2627
2628 } else {
2629 _cleanup_free_ char *target = NULL;
2630
2631 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2632 readlink_and_make_absolute(p, &target) >= 0) {
2633 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2634
2635 /* This already exists and is a symlink? Interesting. Maybe it's one created
2636 * by DynamicUser=1 (see above)?
2637 *
2638 * We do this for all directory types except for ConfigurationDirectory=,
2639 * since they all support the private/ symlink logic at least in some
2640 * configurations, see above. */
2641
2642 r = chase(target, NULL, 0, &target_resolved, NULL);
2643 if (r < 0)
2644 goto fail;
2645
2646 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2647 if (!q) {
2648 r = -ENOMEM;
2649 goto fail;
2650 }
2651
2652 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2653 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2654 if (r < 0)
2655 goto fail;
2656
2657 if (path_equal(q_resolved, target_resolved)) {
2658
2659 /* Hmm, apparently DynamicUser= was once turned on for this service,
2660 * but is no longer. Let's move the directory back up. */
2661
2662 log_unit_info(u, "Found pre-existing private %s= directory %s, migrating to %s.\n"
2663 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2664 exec_directory_type_to_string(type), q, p);
2665
2666 r = RET_NERRNO(unlink(p));
2667 if (r < 0)
2668 goto fail;
2669
2670 r = RET_NERRNO(rename(q, p));
2671 if (r < 0)
2672 goto fail;
2673 }
2674 }
2675
2676 r = mkdir_label(p, context->directories[type].mode);
2677 if (r < 0) {
2678 if (r != -EEXIST)
2679 goto fail;
2680
2681 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2682 struct stat st;
2683
2684 /* Don't change the owner/access mode of the configuration directory,
2685 * as in the common case it is not written to by a service, and shall
2686 * not be writable. */
2687
2688 r = RET_NERRNO(stat(p, &st));
2689 if (r < 0)
2690 goto fail;
2691
2692 /* Still complain if the access mode doesn't match */
2693 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2694 log_unit_warning(u, "%s \'%s\' already exists but the mode is different. "
2695 "(File system: %o %sMode: %o)",
2696 exec_directory_type_to_string(type), context->directories[type].items[i].path,
2697 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2698
2699 continue;
2700 }
2701 }
2702 }
2703
2704 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2705 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2706 * current UID/GID ownership.) */
2707 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2708 if (r < 0)
2709 goto fail;
2710
2711 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2712 * available to user code anyway */
2713 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2714 continue;
2715
2716 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2717 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2718 * assignments to exist. */
2719 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2720 if (r < 0)
2721 goto fail;
2722 }
2723
2724 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2725 * they are set up later, to allow configuring empty var/run/etc. */
2726 if (!needs_mount_namespace)
2727 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2728 r = create_many_symlinks(params->prefix[type],
2729 context->directories[type].items[i].path,
2730 context->directories[type].items[i].symlinks);
2731 if (r < 0)
2732 goto fail;
2733 }
2734
2735 return 0;
2736
2737 fail:
2738 *exit_status = exit_status_table[type];
2739 return r;
2740 }
2741
2742 static int write_credential(
2743 int dfd,
2744 const char *id,
2745 const void *data,
2746 size_t size,
2747 uid_t uid,
2748 bool ownership_ok) {
2749
2750 _cleanup_(unlink_and_freep) char *tmp = NULL;
2751 _cleanup_close_ int fd = -EBADF;
2752 int r;
2753
2754 r = tempfn_random_child("", "cred", &tmp);
2755 if (r < 0)
2756 return r;
2757
2758 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2759 if (fd < 0) {
2760 tmp = mfree(tmp);
2761 return -errno;
2762 }
2763
2764 r = loop_write(fd, data, size, /* do_poll = */ false);
2765 if (r < 0)
2766 return r;
2767
2768 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2769 return -errno;
2770
2771 if (uid_is_valid(uid) && uid != getuid()) {
2772 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2773 if (r < 0) {
2774 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2775 return r;
2776
2777 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2778 * to express: that the user gets read access and nothing
2779 * else. But if the backing fs can't support that (e.g. ramfs)
2780 * then we can use file ownership instead. But that's only safe if
2781 * we can then re-mount the whole thing read-only, so that the
2782 * user can no longer chmod() the file to gain write access. */
2783 return r;
2784
2785 if (fchown(fd, uid, GID_INVALID) < 0)
2786 return -errno;
2787 }
2788 }
2789
2790 if (renameat(dfd, tmp, dfd, id) < 0)
2791 return -errno;
2792
2793 tmp = mfree(tmp);
2794 return 0;
2795 }
2796
/* Selects which set of credential store directories credential_search_path() assembles. */
typedef enum CredentialSearchPath {
        CREDENTIAL_SEARCH_PATH_TRUSTED = 0,             /* only the plaintext ("credstore") locations */
        CREDENTIAL_SEARCH_PATH_ENCRYPTED,               /* only the "credstore.encrypted" locations */
        CREDENTIAL_SEARCH_PATH_ALL,                     /* both of the above */
        _CREDENTIAL_SEARCH_PATH_MAX,
        _CREDENTIAL_SEARCH_PATH_INVALID = -EINVAL,
} CredentialSearchPath;
2804
2805 static char **credential_search_path(const ExecParameters *params, CredentialSearchPath path) {
2806
2807 _cleanup_strv_free_ char **l = NULL;
2808
2809 assert(params);
2810 assert(path >= 0 && path < _CREDENTIAL_SEARCH_PATH_MAX);
2811
2812 /* Assemble a search path to find credentials in. For non-encrypted credentials, We'll look in
2813 * /etc/credstore/ (and similar directories in /usr/lib/ + /run/). If we're looking for encrypted
2814 * credentials, we'll look in /etc/credstore.encrypted/ (and similar dirs). */
2815
2816 if (IN_SET(path, CREDENTIAL_SEARCH_PATH_ENCRYPTED, CREDENTIAL_SEARCH_PATH_ALL)) {
2817 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2818 return NULL;
2819
2820 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2821 return NULL;
2822 }
2823
2824 if (IN_SET(path, CREDENTIAL_SEARCH_PATH_TRUSTED, CREDENTIAL_SEARCH_PATH_ALL)) {
2825 if (params->received_credentials_directory)
2826 if (strv_extend(&l, params->received_credentials_directory) < 0)
2827 return NULL;
2828
2829 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2830 return NULL;
2831 }
2832
2833 if (DEBUG_LOGGING) {
2834 _cleanup_free_ char *t = strv_join(l, ":");
2835
2836 log_debug("Credential search path is: %s", strempty(t));
2837 }
2838
2839 return TAKE_PTR(l);
2840 }
2841
2842 static int maybe_decrypt_and_write_credential(
2843 int dir_fd,
2844 const char *id,
2845 bool encrypted,
2846 uid_t uid,
2847 bool ownership_ok,
2848 const char *data,
2849 size_t size,
2850 uint64_t *left) {
2851
2852 _cleanup_free_ void *plaintext = NULL;
2853 size_t add;
2854 int r;
2855
2856 if (encrypted) {
2857 size_t plaintext_size = 0;
2858
2859 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size,
2860 &plaintext, &plaintext_size);
2861 if (r < 0)
2862 return r;
2863
2864 data = plaintext;
2865 size = plaintext_size;
2866 }
2867
2868 add = strlen(id) + size;
2869 if (add > *left)
2870 return -E2BIG;
2871
2872 r = write_credential(dir_fd, id, data, size, uid, ownership_ok);
2873 if (r < 0)
2874 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
2875
2876 *left -= add;
2877 return 0;
2878 }
2879
2880 static int load_credential_glob(
2881 const char *path,
2882 bool encrypted,
2883 char **search_path,
2884 ReadFullFileFlags flags,
2885 int write_dfd,
2886 uid_t uid,
2887 bool ownership_ok,
2888 uint64_t *left) {
2889
2890 int r;
2891
2892 STRV_FOREACH(d, search_path) {
2893 _cleanup_globfree_ glob_t pglob = {};
2894 _cleanup_free_ char *j = NULL;
2895
2896 j = path_join(*d, path);
2897 if (!j)
2898 return -ENOMEM;
2899
2900 r = safe_glob(j, 0, &pglob);
2901 if (r == -ENOENT)
2902 continue;
2903 if (r < 0)
2904 return r;
2905
2906 for (size_t n = 0; n < pglob.gl_pathc; n++) {
2907 _cleanup_free_ char *fn = NULL;
2908 _cleanup_(erase_and_freep) char *data = NULL;
2909 size_t size;
2910
2911 /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2912 r = read_full_file_full(
2913 AT_FDCWD,
2914 pglob.gl_pathv[n],
2915 UINT64_MAX,
2916 encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
2917 flags,
2918 NULL,
2919 &data, &size);
2920 if (r < 0)
2921 return log_debug_errno(r, "Failed to read credential '%s': %m",
2922 pglob.gl_pathv[n]);
2923
2924 r = path_extract_filename(pglob.gl_pathv[n], &fn);
2925 if (r < 0)
2926 return log_debug_errno(r, "Failed to extract filename from '%s': %m",
2927 pglob.gl_pathv[n]);
2928
2929 r = maybe_decrypt_and_write_credential(
2930 write_dfd,
2931 fn,
2932 encrypted,
2933 uid,
2934 ownership_ok,
2935 data, size,
2936 left);
2937 if (r == -EEXIST)
2938 continue;
2939 if (r < 0)
2940 return r;
2941 }
2942 }
2943
2944 return 0;
2945 }
2946
2947 static int load_credential(
2948 const ExecContext *context,
2949 const ExecParameters *params,
2950 const char *id,
2951 const char *path,
2952 bool encrypted,
2953 const char *unit,
2954 int read_dfd,
2955 int write_dfd,
2956 uid_t uid,
2957 bool ownership_ok,
2958 uint64_t *left) {
2959
2960 ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2961 _cleanup_strv_free_ char **search_path = NULL;
2962 _cleanup_(erase_and_freep) char *data = NULL;
2963 _cleanup_free_ char *bindname = NULL;
2964 const char *source = NULL;
2965 bool missing_ok = true;
2966 size_t size, maxsz;
2967 int r;
2968
2969 assert(context);
2970 assert(params);
2971 assert(id);
2972 assert(path);
2973 assert(unit);
2974 assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
2975 assert(write_dfd >= 0);
2976 assert(left);
2977
2978 if (read_dfd >= 0) {
2979 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2980 * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2981 * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2982 * open it. */
2983
2984 if (!filename_is_valid(path)) /* safety check */
2985 return -EINVAL;
2986
2987 missing_ok = true;
2988 source = path;
2989
2990 } else if (path_is_absolute(path)) {
2991 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2992 * sockets */
2993
2994 if (!path_is_valid(path)) /* safety check */
2995 return -EINVAL;
2996
2997 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2998
2999 /* Pass some minimal info about the unit and the credential name we are looking to acquire
3000 * via the source socket address in case we read off an AF_UNIX socket. */
3001 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
3002 return -ENOMEM;
3003
3004 missing_ok = false;
3005 source = path;
3006
3007 } else if (credential_name_valid(path)) {
3008 /* If this is a relative path, take it as credential name relative to the credentials
3009 * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
3010 * are operating on a credential store, i.e. this is guaranteed to be regular files. */
3011
3012 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ALL);
3013 if (!search_path)
3014 return -ENOMEM;
3015
3016 missing_ok = true;
3017 } else
3018 source = NULL;
3019
3020 if (encrypted)
3021 flags |= READ_FULL_FILE_UNBASE64;
3022
3023 maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
3024
3025 if (search_path) {
3026 STRV_FOREACH(d, search_path) {
3027 _cleanup_free_ char *j = NULL;
3028
3029 j = path_join(*d, path);
3030 if (!j)
3031 return -ENOMEM;
3032
3033 r = read_full_file_full(
3034 AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
3035 UINT64_MAX,
3036 maxsz,
3037 flags,
3038 NULL,
3039 &data, &size);
3040 if (r != -ENOENT)
3041 break;
3042 }
3043 } else if (source)
3044 r = read_full_file_full(
3045 read_dfd, source,
3046 UINT64_MAX,
3047 maxsz,
3048 flags,
3049 bindname,
3050 &data, &size);
3051 else
3052 r = -ENOENT;
3053
3054 if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
3055 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
3056 * will get clear errors if we don't pass such a missing credential on as they
3057 * themselves will get ENOENT when trying to read them, which should not be much
3058 * worse than when we handle the error here and make it fatal.
3059 *
3060 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
3061 * we are fine, too. */
3062 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
3063 return 0;
3064 }
3065 if (r < 0)
3066 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
3067
3068 return maybe_decrypt_and_write_credential(write_dfd, id, encrypted, uid, ownership_ok, data, size, left);
3069 }
3070
3071 struct load_cred_args {
3072 const ExecContext *context;
3073 const ExecParameters *params;
3074 bool encrypted;
3075 const char *unit;
3076 int dfd;
3077 uid_t uid;
3078 bool ownership_ok;
3079 uint64_t *left;
3080 };
3081
3082 static int load_cred_recurse_dir_cb(
3083 RecurseDirEvent event,
3084 const char *path,
3085 int dir_fd,
3086 int inode_fd,
3087 const struct dirent *de,
3088 const struct statx *sx,
3089 void *userdata) {
3090
3091 struct load_cred_args *args = ASSERT_PTR(userdata);
3092 _cleanup_free_ char *sub_id = NULL;
3093 int r;
3094
3095 if (event != RECURSE_DIR_ENTRY)
3096 return RECURSE_DIR_CONTINUE;
3097
3098 if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
3099 return RECURSE_DIR_CONTINUE;
3100
3101 sub_id = strreplace(path, "/", "_");
3102 if (!sub_id)
3103 return -ENOMEM;
3104
3105 if (!credential_name_valid(sub_id))
3106 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
3107
3108 if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
3109 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
3110 return RECURSE_DIR_CONTINUE;
3111 }
3112 if (errno != ENOENT)
3113 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
3114
3115 r = load_credential(
3116 args->context,
3117 args->params,
3118 sub_id,
3119 de->d_name,
3120 args->encrypted,
3121 args->unit,
3122 dir_fd,
3123 args->dfd,
3124 args->uid,
3125 args->ownership_ok,
3126 args->left);
3127 if (r < 0)
3128 return r;
3129
3130 return RECURSE_DIR_CONTINUE;
3131 }
3132
3133 static int acquire_credentials(
3134 const ExecContext *context,
3135 const ExecParameters *params,
3136 const char *unit,
3137 const char *p,
3138 uid_t uid,
3139 bool ownership_ok) {
3140
3141 uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
3142 _cleanup_close_ int dfd = -EBADF;
3143 const char *ic;
3144 ExecLoadCredential *lc;
3145 ExecSetCredential *sc;
3146 int r;
3147
3148 assert(context);
3149 assert(p);
3150
3151 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
3152 if (dfd < 0)
3153 return -errno;
3154
3155 r = fd_acl_make_writable(dfd); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */
3156 if (r < 0)
3157 return r;
3158
3159 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
3160 HASHMAP_FOREACH(lc, context->load_credentials) {
3161 _cleanup_close_ int sub_fd = -EBADF;
3162
3163 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
3164 * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
3165 * a regular file. Finally, if it's a relative path we will use it as a credential name to
3166 * propagate a credential passed to us from further up. */
3167
3168 if (path_is_absolute(lc->path)) {
3169 sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
3170 if (sub_fd < 0 && !IN_SET(errno,
3171 ENOTDIR, /* Not a directory */
3172 ENOENT)) /* Doesn't exist? */
3173 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
3174 }
3175
3176 if (sub_fd < 0)
3177 /* Regular file (incl. a credential passed in from higher up) */
3178 r = load_credential(
3179 context,
3180 params,
3181 lc->id,
3182 lc->path,
3183 lc->encrypted,
3184 unit,
3185 AT_FDCWD,
3186 dfd,
3187 uid,
3188 ownership_ok,
3189 &left);
3190 else
3191 /* Directory */
3192 r = recurse_dir(
3193 sub_fd,
3194 /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
3195 /* statx_mask= */ 0,
3196 /* n_depth_max= */ UINT_MAX,
3197 RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
3198 load_cred_recurse_dir_cb,
3199 &(struct load_cred_args) {
3200 .context = context,
3201 .params = params,
3202 .encrypted = lc->encrypted,
3203 .unit = unit,
3204 .dfd = dfd,
3205 .uid = uid,
3206 .ownership_ok = ownership_ok,
3207 .left = &left,
3208 });
3209 if (r < 0)
3210 return r;
3211 }
3212
3213 /* Next, look for system credentials and credentials in the credentials store. Note that these do not
3214 * override any credentials found earlier. */
3215 SET_FOREACH(ic, context->import_credentials) {
3216 _cleanup_free_ char **search_path = NULL;
3217
3218 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_TRUSTED);
3219 if (!search_path)
3220 return -ENOMEM;
3221
3222 r = load_credential_glob(
3223 ic,
3224 /* encrypted = */ false,
3225 search_path,
3226 READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER,
3227 dfd,
3228 uid,
3229 ownership_ok,
3230 &left);
3231 if (r < 0)
3232 return r;
3233
3234 search_path = strv_free(search_path);
3235 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ENCRYPTED);
3236 if (!search_path)
3237 return -ENOMEM;
3238
3239 r = load_credential_glob(
3240 ic,
3241 /* encrypted = */ true,
3242 search_path,
3243 READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER|READ_FULL_FILE_UNBASE64,
3244 dfd,
3245 uid,
3246 ownership_ok,
3247 &left);
3248 if (r < 0)
3249 return r;
3250 }
3251
3252 /* Finally, we add in literally specified credentials. If the credentials already exist, we'll not
3253 * add them, so that they can act as a "default" if the same credential is specified multiple times. */
3254 HASHMAP_FOREACH(sc, context->set_credentials) {
3255 _cleanup_(erase_and_freep) void *plaintext = NULL;
3256 const char *data;
3257 size_t size, add;
3258
3259 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
3260 * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
3261 * slow and involved, hence it's nice to be able to skip that if the credential already
3262 * exists anyway. */
3263 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
3264 continue;
3265 if (errno != ENOENT)
3266 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
3267
3268 if (sc->encrypted) {
3269 r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
3270 if (r < 0)
3271 return r;
3272
3273 data = plaintext;
3274 } else {
3275 data = sc->data;
3276 size = sc->size;
3277 }
3278
3279 add = strlen(sc->id) + size;
3280 if (add > left)
3281 return -E2BIG;
3282
3283 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
3284 if (r < 0)
3285 return r;
3286
3287 left -= add;
3288 }
3289
3290 r = fd_acl_make_read_only(dfd); /* Now take away the "w" bit */
3291 if (r < 0)
3292 return r;
3293
3294 /* After we created all keys with the right perms, also make sure the credential store as a whole is
3295 * accessible */
3296
3297 if (uid_is_valid(uid) && uid != getuid()) {
3298 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
3299 if (r < 0) {
3300 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3301 return r;
3302
3303 if (!ownership_ok)
3304 return r;
3305
3306 if (fchown(dfd, uid, GID_INVALID) < 0)
3307 return -errno;
3308 }
3309 }
3310
3311 return 0;
3312 }
3313
3314 static int setup_credentials_internal(
3315 const ExecContext *context,
3316 const ExecParameters *params,
3317 const char *unit,
3318 const char *final, /* This is where the credential store shall eventually end up at */
3319 const char *workspace, /* This is where we can prepare it before moving it to the final place */
3320 bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
3321 bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
3322 uid_t uid) {
3323
3324 int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
3325 * if we mounted something; false if we definitely can't mount anything */
3326 bool final_mounted;
3327 const char *where;
3328
3329 assert(context);
3330 assert(final);
3331 assert(workspace);
3332
3333 if (reuse_workspace) {
3334 r = path_is_mount_point(workspace, NULL, 0);
3335 if (r < 0)
3336 return r;
3337 if (r > 0)
3338 workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3339 else
3340 workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
3341 } else
3342 workspace_mounted = -1; /* ditto */
3343
3344 r = path_is_mount_point(final, NULL, 0);
3345 if (r < 0)
3346 return r;
3347 if (r > 0) {
3348 /* If the final place already has something mounted, we use that. If the workspace also has
3349 * something mounted we assume it's actually the same mount (but with MS_RDONLY
3350 * different). */
3351 final_mounted = true;
3352
3353 if (workspace_mounted < 0) {
3354 /* If the final place is mounted, but the workspace isn't, then let's bind mount
3355 * the final version to the workspace, and make it writable, so that we can make
3356 * changes */
3357
3358 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3359 if (r < 0)
3360 return r;
3361
3362 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
3363 if (r < 0)
3364 return r;
3365
3366 workspace_mounted = true;
3367 }
3368 } else
3369 final_mounted = false;
3370
3371 if (workspace_mounted < 0) {
3372 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3373
3374 r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
3375 if (r < 0) {
3376 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
3377 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3378 if (r < 0) {
3379 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3380 return r;
3381
3382 if (must_mount) /* If we it's not OK to use the plain directory
3383 * fallback, propagate all errors too */
3384 return r;
3385
3386 /* If we lack privileges to bind mount stuff, then let's gracefully
3387 * proceed for compat with container envs, and just use the final dir
3388 * as is. */
3389
3390 workspace_mounted = false;
3391 } else {
3392 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3393 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
3394 if (r < 0)
3395 return r;
3396
3397 workspace_mounted = true;
3398 }
3399 } else
3400 workspace_mounted = true;
3401 }
3402
3403 assert(!must_mount || workspace_mounted > 0);
3404 where = workspace_mounted ? workspace : final;
3405
3406 (void) label_fix_full(AT_FDCWD, where, final, 0);
3407
3408 r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
3409 if (r < 0)
3410 return r;
3411
3412 if (workspace_mounted) {
3413 bool install;
3414
3415 /* Determine if we should actually install the prepared mount in the final location by bind
3416 * mounting it there. We do so only if the mount is not established there already, and if the
3417 * mount is actually non-empty (i.e. carries at least one credential). Not that in the best
3418 * case we are doing all this in a mount namespace, thus no one else will see that we
3419 * allocated a file system we are getting rid of again here. */
3420 if (final_mounted)
3421 install = false; /* already installed */
3422 else {
3423 r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false);
3424 if (r < 0)
3425 return r;
3426
3427 install = r == 0; /* install only if non-empty */
3428 }
3429
3430 if (install) {
3431 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3432 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
3433 if (r < 0)
3434 return r;
3435
3436 /* And mount it to the final place, read-only */
3437 r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3438 } else
3439 /* Otherwise get rid of it */
3440 r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3441 if (r < 0)
3442 return r;
3443 } else {
3444 _cleanup_free_ char *parent = NULL;
3445
3446 /* If we do not have our own mount put used the plain directory fallback, then we need to
3447 * open access to the top-level credential directory and the per-service directory now */
3448
3449 r = path_extract_directory(final, &parent);
3450 if (r < 0)
3451 return r;
3452 if (chmod(parent, 0755) < 0)
3453 return -errno;
3454 }
3455
3456 return 0;
3457 }
3458
3459 static int setup_credentials(
3460 const ExecContext *context,
3461 const ExecParameters *params,
3462 const char *unit,
3463 uid_t uid) {
3464
3465 _cleanup_free_ char *p = NULL, *q = NULL;
3466 int r;
3467
3468 assert(context);
3469 assert(params);
3470
3471 if (!exec_context_has_credentials(context))
3472 return 0;
3473
3474 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3475 return -EINVAL;
3476
3477 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3478 * and the subdir we mount over with a read-only file system readable by the service's user */
3479 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3480 if (!q)
3481 return -ENOMEM;
3482
3483 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3484 if (r < 0 && r != -EEXIST)
3485 return r;
3486
3487 p = path_join(q, unit);
3488 if (!p)
3489 return -ENOMEM;
3490
3491 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3492 if (r < 0 && r != -EEXIST)
3493 return r;
3494
3495 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3496 if (r < 0) {
3497 _cleanup_free_ char *t = NULL, *u = NULL;
3498
3499 /* If this is not a privilege or support issue then propagate the error */
3500 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3501 return r;
3502
3503 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3504 * it into place, so that users can't access half-initialized credential stores. */
3505 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3506 if (!t)
3507 return -ENOMEM;
3508
3509 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3510 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3511 * after it is fully set up */
3512 u = path_join(t, unit);
3513 if (!u)
3514 return -ENOMEM;
3515
3516 FOREACH_STRING(i, t, u) {
3517 r = mkdir_label(i, 0700);
3518 if (r < 0 && r != -EEXIST)
3519 return r;
3520 }
3521
3522 r = setup_credentials_internal(
3523 context,
3524 params,
3525 unit,
3526 p, /* final mount point */
3527 u, /* temporary workspace to overmount */
3528 true, /* reuse the workspace if it is already a mount */
3529 false, /* it's OK to fall back to a plain directory if we can't mount anything */
3530 uid);
3531
3532 (void) rmdir(u); /* remove the workspace again if we can. */
3533
3534 if (r < 0)
3535 return r;
3536
3537 } else if (r == 0) {
3538
3539 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3540 * we can use the same directory for all cases, after turning off propagation. Question
3541 * though is: where do we turn off propagation exactly, and where do we place the workspace
3542 * directory? We need some place that is guaranteed to be a mount point in the host, and
3543 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3544 * since we ultimately want to move the resulting file system there, i.e. we need propagation
3545 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3546 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3547 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3548 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3549 * propagation on the former, and then overmount the latter.
3550 *
3551 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3552 * for this purpose, but there are few other candidates that work equally well for us, and
3553 * given that the we do this in a privately namespaced short-lived single-threaded process
3554 * that no one else sees this should be OK to do. */
3555
3556 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3557 if (r < 0)
3558 goto child_fail;
3559
3560 r = setup_credentials_internal(
3561 context,
3562 params,
3563 unit,
3564 p, /* final mount point */
3565 "/dev/shm", /* temporary workspace to overmount */
3566 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3567 true, /* insist that something is mounted, do not allow fallback to plain directory */
3568 uid);
3569 if (r < 0)
3570 goto child_fail;
3571
3572 _exit(EXIT_SUCCESS);
3573
3574 child_fail:
3575 _exit(EXIT_FAILURE);
3576 }
3577
3578 /* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
3579 * try to remove it. This matters in particular if we created the dir as mount point but then didn't
3580 * actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCESS being
3581 * seen by users when trying access this inode. */
3582 (void) rmdir(p);
3583 return 0;
3584 }
3585
#if ENABLE_SMACK
/* Applies a SMACK label to the calling process (PID 0 is passed to mac_smack_apply_pid()). An explicitly
 * configured process label takes precedence. Otherwise, if the manager carries a default label, the
 * SMACK_ATTR_EXEC label read from the executable is used, falling back to the manager's default if none
 * could be read. Returns 0 on success or if there is nothing to do, negative errno on failure. */
static int setup_smack(
                const Manager *manager,
                const ExecContext *context,
                int executable_fd) {

        _cleanup_free_ char *exec_label = NULL;
        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

        if (!manager->default_smack_process_label)
                return 0;

        /* A missing xattr is not an error: exec_label then stays NULL and the default is used below. */
        r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
        if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                return r;

        r = mac_smack_apply_pid(0, exec_label ?: manager->default_smack_process_label);
        return r < 0 ? r : 0;
}
#endif
3615
3616 static int compile_bind_mounts(
3617 const ExecContext *context,
3618 const ExecParameters *params,
3619 BindMount **ret_bind_mounts,
3620 size_t *ret_n_bind_mounts,
3621 char ***ret_empty_directories) {
3622
3623 _cleanup_strv_free_ char **empty_directories = NULL;
3624 BindMount *bind_mounts = NULL;
3625 size_t n, h = 0;
3626 int r;
3627
3628 assert(context);
3629 assert(params);
3630 assert(ret_bind_mounts);
3631 assert(ret_n_bind_mounts);
3632 assert(ret_empty_directories);
3633
3634 CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
3635
3636 n = context->n_bind_mounts;
3637 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3638 if (!params->prefix[t])
3639 continue;
3640
3641 for (size_t i = 0; i < context->directories[t].n_items; i++)
3642 n += !context->directories[t].items[i].only_create;
3643 }
3644
3645 if (n <= 0) {
3646 *ret_bind_mounts = NULL;
3647 *ret_n_bind_mounts = 0;
3648 *ret_empty_directories = NULL;
3649 return 0;
3650 }
3651
3652 bind_mounts = new(BindMount, n);
3653 if (!bind_mounts)
3654 return -ENOMEM;
3655
3656 for (size_t i = 0; i < context->n_bind_mounts; i++) {
3657 BindMount *item = context->bind_mounts + i;
3658 _cleanup_free_ char *s = NULL, *d = NULL;
3659
3660 s = strdup(item->source);
3661 if (!s)
3662 return -ENOMEM;
3663
3664 d = strdup(item->destination);
3665 if (!d)
3666 return -ENOMEM;
3667
3668 bind_mounts[h++] = (BindMount) {
3669 .source = TAKE_PTR(s),
3670 .destination = TAKE_PTR(d),
3671 .read_only = item->read_only,
3672 .recursive = item->recursive,
3673 .ignore_enoent = item->ignore_enoent,
3674 };
3675 }
3676
3677 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3678 if (!params->prefix[t])
3679 continue;
3680
3681 if (context->directories[t].n_items == 0)
3682 continue;
3683
3684 if (exec_directory_is_private(context, t) &&
3685 !exec_context_with_rootfs(context)) {
3686 char *private_root;
3687
3688 /* So this is for a dynamic user, and we need to make sure the process can access its own
3689 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3690 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3691
3692 private_root = path_join(params->prefix[t], "private");
3693 if (!private_root)
3694 return -ENOMEM;
3695
3696 r = strv_consume(&empty_directories, private_root);
3697 if (r < 0)
3698 return r;
3699 }
3700
3701 for (size_t i = 0; i < context->directories[t].n_items; i++) {
3702 _cleanup_free_ char *s = NULL, *d = NULL;
3703
3704 /* When one of the parent directories is in the list, we cannot create the symlink
3705 * for the child directory. See also the comments in setup_exec_directory(). */
3706 if (context->directories[t].items[i].only_create)
3707 continue;
3708
3709 if (exec_directory_is_private(context, t))
3710 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
3711 else
3712 s = path_join(params->prefix[t], context->directories[t].items[i].path);
3713 if (!s)
3714 return -ENOMEM;
3715
3716 if (exec_directory_is_private(context, t) &&
3717 exec_context_with_rootfs(context))
3718 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3719 * directory is not created on the root directory. So, let's bind-mount the directory
3720 * on the 'non-private' place. */
3721 d = path_join(params->prefix[t], context->directories[t].items[i].path);
3722 else
3723 d = strdup(s);
3724 if (!d)
3725 return -ENOMEM;
3726
3727 bind_mounts[h++] = (BindMount) {
3728 .source = TAKE_PTR(s),
3729 .destination = TAKE_PTR(d),
3730 .read_only = false,
3731 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3732 .recursive = true,
3733 .ignore_enoent = false,
3734 };
3735 }
3736 }
3737
3738 assert(h == n);
3739
3740 *ret_bind_mounts = TAKE_PTR(bind_mounts);
3741 *ret_n_bind_mounts = n;
3742 *ret_empty_directories = TAKE_PTR(empty_directories);
3743
3744 return (int) n;
3745 }
3746
3747 /* ret_symlinks will contain a list of pairs src:dest that describes
3748 * the symlinks to create later on. For example, the symlinks needed
3749 * to safely give private directories to DynamicUser=1 users. */
3750 static int compile_symlinks(
3751 const ExecContext *context,
3752 const ExecParameters *params,
3753 char ***ret_symlinks) {
3754
3755 _cleanup_strv_free_ char **symlinks = NULL;
3756 int r;
3757
3758 assert(context);
3759 assert(params);
3760 assert(ret_symlinks);
3761
3762 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3763 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3764 _cleanup_free_ char *private_path = NULL, *path = NULL;
3765
3766 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3767 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3768
3769 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3770 dst_abs = path_join(params->prefix[dt], *symlink);
3771 if (!src_abs || !dst_abs)
3772 return -ENOMEM;
3773
3774 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3775 if (r < 0)
3776 return r;
3777 }
3778
3779 if (!exec_directory_is_private(context, dt) ||
3780 exec_context_with_rootfs(context) ||
3781 context->directories[dt].items[i].only_create)
3782 continue;
3783
3784 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
3785 if (!private_path)
3786 return -ENOMEM;
3787
3788 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3789 if (!path)
3790 return -ENOMEM;
3791
3792 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3793 if (r < 0)
3794 return r;
3795 }
3796 }
3797
3798 *ret_symlinks = TAKE_PTR(symlinks);
3799
3800 return 0;
3801 }
3802
3803 static bool insist_on_sandboxing(
3804 const ExecContext *context,
3805 const char *root_dir,
3806 const char *root_image,
3807 const BindMount *bind_mounts,
3808 size_t n_bind_mounts) {
3809
3810 assert(context);
3811 assert(n_bind_mounts == 0 || bind_mounts);
3812
3813 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3814 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3815 * rearrange stuff in a way we cannot ignore gracefully. */
3816
3817 if (context->n_temporary_filesystems > 0)
3818 return true;
3819
3820 if (root_dir || root_image)
3821 return true;
3822
3823 if (context->n_mount_images > 0)
3824 return true;
3825
3826 if (context->dynamic_user)
3827 return true;
3828
3829 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3830 return true;
3831
3832 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3833 * essential. */
3834 for (size_t i = 0; i < n_bind_mounts; i++)
3835 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3836 return true;
3837
3838 if (context->log_namespace)
3839 return true;
3840
3841 return false;
3842 }
3843
3844 static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
3845 _cleanup_close_ int fd = -EBADF;
3846 int r;
3847
3848 if (!runtime || !runtime->ephemeral_copy)
3849 return 0;
3850
3851 r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
3852 if (r < 0)
3853 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
3854
3855 CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
3856
3857 fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3858 if (fd >= 0)
3859 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3860 return 0;
3861
3862 if (fd != -EAGAIN)
3863 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3864
3865 log_debug("Making ephemeral snapshot of %s to %s",
3866 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3867
3868 if (context->root_image)
3869 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
3870 COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
3871 else
3872 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
3873 AT_FDCWD, runtime->ephemeral_copy,
3874 BTRFS_SNAPSHOT_FALLBACK_COPY |
3875 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3876 BTRFS_SNAPSHOT_RECURSIVE |
3877 BTRFS_SNAPSHOT_LOCK_BSD);
3878 if (fd < 0)
3879 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
3880 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3881
3882 if (context->root_image) {
3883 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3884 * which tends to not perform well in combination with lots of random writes.
3885 *
3886 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3887 * copy, but we at least want to make the intention clear.
3888 */
3889 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
3890 if (r < 0)
3891 log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
3892 }
3893
3894 r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
3895 if (r < 0)
3896 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
3897
3898 return 1;
3899 }
3900
3901 static int verity_settings_prepare(
3902 VeritySettings *verity,
3903 const char *root_image,
3904 const void *root_hash,
3905 size_t root_hash_size,
3906 const char *root_hash_path,
3907 const void *root_hash_sig,
3908 size_t root_hash_sig_size,
3909 const char *root_hash_sig_path,
3910 const char *verity_data_path) {
3911
3912 int r;
3913
3914 assert(verity);
3915
3916 if (root_hash) {
3917 void *d;
3918
3919 d = memdup(root_hash, root_hash_size);
3920 if (!d)
3921 return -ENOMEM;
3922
3923 free_and_replace(verity->root_hash, d);
3924 verity->root_hash_size = root_hash_size;
3925 verity->designator = PARTITION_ROOT;
3926 }
3927
3928 if (root_hash_sig) {
3929 void *d;
3930
3931 d = memdup(root_hash_sig, root_hash_sig_size);
3932 if (!d)
3933 return -ENOMEM;
3934
3935 free_and_replace(verity->root_hash_sig, d);
3936 verity->root_hash_sig_size = root_hash_sig_size;
3937 verity->designator = PARTITION_ROOT;
3938 }
3939
3940 if (verity_data_path) {
3941 r = free_and_strdup(&verity->data_path, verity_data_path);
3942 if (r < 0)
3943 return r;
3944 }
3945
3946 r = verity_settings_load(
3947 verity,
3948 root_image,
3949 root_hash_path,
3950 root_hash_sig_path);
3951 if (r < 0)
3952 return log_debug_errno(r, "Failed to load root hash: %m");
3953
3954 return 0;
3955 }
3956
3957 static int apply_mount_namespace(
3958 const Unit *u,
3959 ExecCommandFlags command_flags,
3960 const ExecContext *context,
3961 const ExecParameters *params,
3962 ExecRuntime *runtime,
3963 const char *memory_pressure_path,
3964 char **error_path) {
3965
3966 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3967 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3968 **read_write_paths_cleanup = NULL;
3969 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3970 *extension_dir = NULL, *host_os_release = NULL;
3971 const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
3972 char **read_write_paths;
3973 NamespaceInfo ns_info;
3974 bool needs_sandboxing;
3975 BindMount *bind_mounts = NULL;
3976 size_t n_bind_mounts = 0;
3977 int r;
3978
3979 assert(context);
3980
3981 CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3982
3983 if (params->flags & EXEC_APPLY_CHROOT) {
3984 r = setup_ephemeral(context, runtime);
3985 if (r < 0)
3986 return r;
3987
3988 if (context->root_image)
3989 root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
3990 else
3991 root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
3992 }
3993
3994 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3995 if (r < 0)
3996 return r;
3997
3998 /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
3999 r = compile_symlinks(context, params, &symlinks);
4000 if (r < 0)
4001 return r;
4002
4003 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
4004 * service will need to write to it in order to start the notifications. */
4005 if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
4006 read_write_paths_cleanup = strv_copy(context->read_write_paths);
4007 if (!read_write_paths_cleanup)
4008 return -ENOMEM;
4009
4010 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
4011 if (r < 0)
4012 return r;
4013
4014 read_write_paths = read_write_paths_cleanup;
4015 } else
4016 read_write_paths = context->read_write_paths;
4017
4018 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4019 if (needs_sandboxing) {
4020 /* The runtime struct only contains the parent of the private /tmp,
4021 * which is non-accessible to world users. Inside of it there's a /tmp
4022 * that is sticky, and that's the one we want to use here.
4023 * This does not apply when we are using /run/systemd/empty as fallback. */
4024
4025 if (context->private_tmp && runtime && runtime->shared) {
4026 if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
4027 tmp_dir = runtime->shared->tmp_dir;
4028 else if (runtime->shared->tmp_dir)
4029 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
4030
4031 if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
4032 var_tmp_dir = runtime->shared->var_tmp_dir;
4033 else if (runtime->shared->var_tmp_dir)
4034 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
4035 }
4036
4037 ns_info = (NamespaceInfo) {
4038 .ignore_protect_paths = false,
4039 .private_dev = context->private_devices,
4040 .protect_control_groups = context->protect_control_groups,
4041 .protect_kernel_tunables = context->protect_kernel_tunables,
4042 .protect_kernel_modules = context->protect_kernel_modules,
4043 .protect_kernel_logs = context->protect_kernel_logs,
4044 .protect_hostname = context->protect_hostname,
4045 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
4046 .protect_home = context->protect_home,
4047 .protect_system = context->protect_system,
4048 .protect_proc = context->protect_proc,
4049 .proc_subset = context->proc_subset,
4050 .private_network = exec_needs_network_namespace(context),
4051 .private_ipc = exec_needs_ipc_namespace(context),
4052 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
4053 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
4054 };
4055 } else if (!context->dynamic_user && root_dir)
4056 /*
4057 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
4058 * sandbox info, otherwise enforce it, don't ignore protected paths and
4059 * fail if we are enable to apply the sandbox inside the mount namespace.
4060 */
4061 ns_info = (NamespaceInfo) {
4062 .ignore_protect_paths = true,
4063 };
4064 else
4065 ns_info = (NamespaceInfo) {};
4066
4067 if (context->mount_propagation_flag == MS_SHARED)
4068 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
4069
4070 if (exec_context_has_credentials(context) &&
4071 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
4072 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4073 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
4074 if (!creds_path)
4075 return -ENOMEM;
4076 }
4077
4078 if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
4079 propagate_dir = path_join("/run/systemd/propagate/", u->id);
4080 if (!propagate_dir)
4081 return -ENOMEM;
4082
4083 incoming_dir = strdup("/run/systemd/incoming");
4084 if (!incoming_dir)
4085 return -ENOMEM;
4086
4087 extension_dir = strdup("/run/systemd/unit-extensions");
4088 if (!extension_dir)
4089 return -ENOMEM;
4090
4091 /* If running under a different root filesystem, propagate the host's os-release. We make a
4092 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
4093 if (root_dir || root_image) {
4094 host_os_release = strdup("/run/systemd/propagate/os-release");
4095 if (!host_os_release)
4096 return -ENOMEM;
4097 }
4098 } else {
4099 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
4100
4101 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
4102 return -ENOMEM;
4103
4104 if (root_dir || root_image) {
4105 if (asprintf(&host_os_release, "/run/user/" UID_FMT "/systemd/propagate/os-release", geteuid()) < 0)
4106 return -ENOMEM;
4107 }
4108 }
4109
4110 if (root_image) {
4111 r = verity_settings_prepare(
4112 &verity,
4113 root_image,
4114 context->root_hash, context->root_hash_size, context->root_hash_path,
4115 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
4116 context->root_verity);
4117 if (r < 0)
4118 return r;
4119 }
4120
4121 r = setup_namespace(
4122 root_dir,
4123 root_image,
4124 context->root_image_options,
4125 context->root_image_policy ?: &image_policy_service,
4126 &ns_info,
4127 read_write_paths,
4128 needs_sandboxing ? context->read_only_paths : NULL,
4129 needs_sandboxing ? context->inaccessible_paths : NULL,
4130 needs_sandboxing ? context->exec_paths : NULL,
4131 needs_sandboxing ? context->no_exec_paths : NULL,
4132 empty_directories,
4133 symlinks,
4134 bind_mounts,
4135 n_bind_mounts,
4136 context->temporary_filesystems,
4137 context->n_temporary_filesystems,
4138 context->mount_images,
4139 context->n_mount_images,
4140 context->mount_image_policy ?: &image_policy_service,
4141 tmp_dir,
4142 var_tmp_dir,
4143 creds_path,
4144 context->log_namespace,
4145 context->mount_propagation_flag,
4146 &verity,
4147 context->extension_images,
4148 context->n_extension_images,
4149 context->extension_image_policy ?: &image_policy_sysext,
4150 context->extension_directories,
4151 propagate_dir,
4152 incoming_dir,
4153 extension_dir,
4154 root_dir || root_image ? params->notify_socket : NULL,
4155 host_os_release,
4156 error_path);
4157
4158 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
4159 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
4160 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
4161 * completely different execution environment. */
4162 if (r == -ENOANO) {
4163 if (insist_on_sandboxing(
4164 context,
4165 root_dir, root_image,
4166 bind_mounts,
4167 n_bind_mounts))
4168 return log_unit_debug_errno(u,
4169 SYNTHETIC_ERRNO(EOPNOTSUPP),
4170 "Failed to set up namespace, and refusing to continue since "
4171 "the selected namespacing options alter mount environment non-trivially.\n"
4172 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
4173 n_bind_mounts,
4174 context->n_temporary_filesystems,
4175 yes_no(root_dir),
4176 yes_no(root_image),
4177 yes_no(context->dynamic_user));
4178
4179 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4180 return 0;
4181 }
4182
4183 return r;
4184 }
4185
4186 static int apply_working_directory(
4187 const ExecContext *context,
4188 const ExecParameters *params,
4189 ExecRuntime *runtime,
4190 const char *home,
4191 int *exit_status) {
4192
4193 const char *d, *wd;
4194
4195 assert(context);
4196 assert(exit_status);
4197
4198 if (context->working_directory_home) {
4199
4200 if (!home) {
4201 *exit_status = EXIT_CHDIR;
4202 return -ENXIO;
4203 }
4204
4205 wd = home;
4206
4207 } else
4208 wd = empty_to_root(context->working_directory);
4209
4210 if (params->flags & EXEC_APPLY_CHROOT)
4211 d = wd;
4212 else
4213 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
4214
4215 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
4216 *exit_status = EXIT_CHDIR;
4217 return -errno;
4218 }
4219
4220 return 0;
4221 }
4222
4223 static int apply_root_directory(
4224 const ExecContext *context,
4225 const ExecParameters *params,
4226 ExecRuntime *runtime,
4227 const bool needs_mount_ns,
4228 int *exit_status) {
4229
4230 assert(context);
4231 assert(exit_status);
4232
4233 if (params->flags & EXEC_APPLY_CHROOT)
4234 if (!needs_mount_ns && context->root_directory)
4235 if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
4236 *exit_status = EXIT_CHROOT;
4237 return -errno;
4238 }
4239
4240 return 0;
4241 }
4242
4243 static int setup_keyring(
4244 const Unit *u,
4245 const ExecContext *context,
4246 const ExecParameters *p,
4247 uid_t uid, gid_t gid) {
4248
4249 key_serial_t keyring;
4250 int r = 0;
4251 uid_t saved_uid;
4252 gid_t saved_gid;
4253
4254 assert(u);
4255 assert(context);
4256 assert(p);
4257
4258 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
4259 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
4260 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
4261 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
4262 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
4263 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
4264
4265 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
4266 return 0;
4267
4268 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
4269 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
4270 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
4271 * & group is just as nasty as acquiring a reference to the user keyring. */
4272
4273 saved_uid = getuid();
4274 saved_gid = getgid();
4275
4276 if (gid_is_valid(gid) && gid != saved_gid) {
4277 if (setregid(gid, -1) < 0)
4278 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
4279 }
4280
4281 if (uid_is_valid(uid) && uid != saved_uid) {
4282 if (setreuid(uid, -1) < 0) {
4283 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
4284 goto out;
4285 }
4286 }
4287
4288 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
4289 if (keyring == -1) {
4290 if (errno == ENOSYS)
4291 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
4292 else if (ERRNO_IS_PRIVILEGE(errno))
4293 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
4294 else if (errno == EDQUOT)
4295 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
4296 else
4297 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
4298
4299 goto out;
4300 }
4301
4302 /* When requested link the user keyring into the session keyring. */
4303 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
4304
4305 if (keyctl(KEYCTL_LINK,
4306 KEY_SPEC_USER_KEYRING,
4307 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
4308 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
4309 goto out;
4310 }
4311 }
4312
4313 /* Restore uid/gid back */
4314 if (uid_is_valid(uid) && uid != saved_uid) {
4315 if (setreuid(saved_uid, -1) < 0) {
4316 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
4317 goto out;
4318 }
4319 }
4320
4321 if (gid_is_valid(gid) && gid != saved_gid) {
4322 if (setregid(saved_gid, -1) < 0)
4323 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
4324 }
4325
4326 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
4327 if (!sd_id128_is_null(u->invocation_id)) {
4328 key_serial_t key;
4329
4330 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
4331 if (key == -1)
4332 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
4333 else {
4334 if (keyctl(KEYCTL_SETPERM, key,
4335 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
4336 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
4337 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
4338 }
4339 }
4340
4341 out:
4342 /* Revert back uid & gid for the last time, and exit */
4343 /* no extra logging, as only the first already reported error matters */
4344 if (getuid() != saved_uid)
4345 (void) setreuid(saved_uid, -1);
4346
4347 if (getgid() != saved_gid)
4348 (void) setregid(saved_gid, -1);
4349
4350 return r;
4351 }
4352
4353 static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
4354 assert(array);
4355 assert(n);
4356 assert(pair);
4357
4358 if (pair[0] >= 0)
4359 array[(*n)++] = pair[0];
4360 if (pair[1] >= 0)
4361 array[(*n)++] = pair[1];
4362 }
4363
4364 static int close_remaining_fds(
4365 const ExecParameters *params,
4366 const ExecRuntime *runtime,
4367 int user_lookup_fd,
4368 int socket_fd,
4369 const int *fds, size_t n_fds) {
4370
4371 size_t n_dont_close = 0;
4372 int dont_close[n_fds + 14];
4373
4374 assert(params);
4375
4376 if (params->stdin_fd >= 0)
4377 dont_close[n_dont_close++] = params->stdin_fd;
4378 if (params->stdout_fd >= 0)
4379 dont_close[n_dont_close++] = params->stdout_fd;
4380 if (params->stderr_fd >= 0)
4381 dont_close[n_dont_close++] = params->stderr_fd;
4382
4383 if (socket_fd >= 0)
4384 dont_close[n_dont_close++] = socket_fd;
4385 if (n_fds > 0) {
4386 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
4387 n_dont_close += n_fds;
4388 }
4389
4390 if (runtime)
4391 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
4392
4393 if (runtime && runtime->shared) {
4394 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
4395 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
4396 }
4397
4398 if (runtime && runtime->dynamic_creds) {
4399 if (runtime->dynamic_creds->user)
4400 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
4401 if (runtime->dynamic_creds->group)
4402 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
4403 }
4404
4405 if (user_lookup_fd >= 0)
4406 dont_close[n_dont_close++] = user_lookup_fd;
4407
4408 return close_all_fds(dont_close, n_dont_close);
4409 }
4410
4411 static int send_user_lookup(
4412 Unit *unit,
4413 int user_lookup_fd,
4414 uid_t uid,
4415 gid_t gid) {
4416
4417 assert(unit);
4418
4419 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
4420 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
4421 * specified. */
4422
4423 if (user_lookup_fd < 0)
4424 return 0;
4425
4426 if (!uid_is_valid(uid) && !gid_is_valid(gid))
4427 return 0;
4428
4429 if (writev(user_lookup_fd,
4430 (struct iovec[]) {
4431 IOVEC_MAKE(&uid, sizeof(uid)),
4432 IOVEC_MAKE(&gid, sizeof(gid)),
4433 IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
4434 return -errno;
4435
4436 return 0;
4437 }
4438
4439 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
4440 int r;
4441
4442 assert(c);
4443 assert(home);
4444 assert(buf);
4445
4446 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
4447
4448 if (*home)
4449 return 0;
4450
4451 if (!c->working_directory_home)
4452 return 0;
4453
4454 r = get_home_dir(buf);
4455 if (r < 0)
4456 return r;
4457
4458 *home = *buf;
4459 return 1;
4460 }
4461
4462 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
4463 _cleanup_strv_free_ char ** list = NULL;
4464 int r;
4465
4466 assert(c);
4467 assert(p);
4468 assert(ret);
4469
4470 assert(c->dynamic_user);
4471
4472 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
4473 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
4474 * directories. */
4475
4476 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4477 if (t == EXEC_DIRECTORY_CONFIGURATION)
4478 continue;
4479
4480 if (!p->prefix[t])
4481 continue;
4482
4483 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4484 char *e;
4485
4486 if (exec_directory_is_private(c, t))
4487 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
4488 else
4489 e = path_join(p->prefix[t], c->directories[t].items[i].path);
4490 if (!e)
4491 return -ENOMEM;
4492
4493 r = strv_consume(&list, e);
4494 if (r < 0)
4495 return r;
4496 }
4497 }
4498
4499 *ret = TAKE_PTR(list);
4500
4501 return 0;
4502 }
4503
4504 static int exec_parameters_get_cgroup_path(
4505 const ExecParameters *params,
4506 const CGroupContext *c,
4507 char **ret) {
4508
4509 const char *subgroup = NULL;
4510 char *p;
4511
4512 assert(params);
4513 assert(ret);
4514
4515 if (!params->cgroup_path)
4516 return -EINVAL;
4517
4518 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4519 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4520 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4521 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4522 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4523 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4524 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4525 * flag, which is only passed for the former statements, not for the latter. */
4526
4527 if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
4528 if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
4529 subgroup = ".control";
4530 else
4531 subgroup = c->delegate_subgroup;
4532 }
4533
4534 if (subgroup)
4535 p = path_join(params->cgroup_path, subgroup);
4536 else
4537 p = strdup(params->cgroup_path);
4538 if (!p)
4539 return -ENOMEM;
4540
4541 *ret = p;
4542 return !!subgroup;
4543 }
4544
4545 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4546 _cleanup_(cpu_set_reset) CPUSet s = {};
4547 int r;
4548
4549 assert(c);
4550 assert(ret);
4551
4552 if (!c->numa_policy.nodes.set) {
4553 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4554 return 0;
4555 }
4556
4557 r = numa_to_cpu_set(&c->numa_policy, &s);
4558 if (r < 0)
4559 return r;
4560
4561 cpu_set_reset(ret);
4562
4563 return cpu_set_add_all(ret, &s);
4564 }
4565
4566 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4567 assert(c);
4568
4569 return c->cpu_affinity_from_numa;
4570 }
4571
4572 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4573 int r;
4574
4575 assert(fds);
4576 assert(n_fds);
4577 assert(*n_fds < fds_size);
4578 assert(ret_fd);
4579
4580 if (fd < 0) {
4581 *ret_fd = -EBADF;
4582 return 0;
4583 }
4584
4585 if (fd < 3 + (int) *n_fds) {
4586 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4587 * the fds we pass to the process (or which are closed only during execve). */
4588
4589 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4590 if (r < 0)
4591 return -errno;
4592
4593 close_and_replace(fd, r);
4594 }
4595
4596 *ret_fd = fds[*n_fds] = fd;
4597 (*n_fds) ++;
4598 return 1;
4599 }
4600
4601 static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
4602 union sockaddr_union addr = {
4603 .un.sun_family = AF_UNIX,
4604 };
4605 socklen_t sa_len;
4606 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
4607 int r;
4608
4609 assert(u);
4610 assert(of);
4611 assert(ofd >= 0);
4612
4613 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
4614 if (r < 0)
4615 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
4616
4617 sa_len = r;
4618
4619 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
4620 _cleanup_close_ int fd = -EBADF;
4621
4622 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
4623 if (fd < 0)
4624 return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
4625
4626 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
4627 if (r == -EPROTOTYPE)
4628 continue;
4629 if (r < 0)
4630 return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
4631
4632 return TAKE_FD(fd);
4633 }
4634
4635 return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
4636 }
4637
4638 static int get_open_file_fd(Unit *u, const OpenFile *of) {
4639 struct stat st;
4640 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
4641
4642 assert(u);
4643 assert(of);
4644
4645 ofd = open(of->path, O_PATH | O_CLOEXEC);
4646 if (ofd < 0)
4647 return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
4648
4649 if (fstat(ofd, &st) < 0)
4650 return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
4651
4652 if (S_ISSOCK(st.st_mode)) {
4653 fd = connect_unix_harder(u, of, ofd);
4654 if (fd < 0)
4655 return fd;
4656
4657 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
4658 return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
4659 of->path);
4660
4661 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
4662 } else {
4663 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
4664 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
4665 flags |= O_APPEND;
4666 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
4667 flags |= O_TRUNC;
4668
4669 fd = fd_reopen(ofd, flags | O_CLOEXEC);
4670 if (fd < 0)
4671 return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
4672
4673 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
4674 }
4675
4676 return TAKE_FD(fd);
4677 }
4678
4679 static int collect_open_file_fds(
4680 Unit *u,
4681 OpenFile* open_files,
4682 int **fds,
4683 char ***fdnames,
4684 size_t *n_fds) {
4685 int r;
4686
4687 assert(u);
4688 assert(fds);
4689 assert(fdnames);
4690 assert(n_fds);
4691
4692 LIST_FOREACH(open_files, of, open_files) {
4693 _cleanup_close_ int fd = -EBADF;
4694
4695 fd = get_open_file_fd(u, of);
4696 if (fd < 0) {
4697 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
4698 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
4699 continue;
4700 }
4701
4702 return fd;
4703 }
4704
4705 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
4706 return -ENOMEM;
4707
4708 r = strv_extend(fdnames, of->fdname);
4709 if (r < 0)
4710 return r;
4711
4712 (*fds)[*n_fds] = TAKE_FD(fd);
4713
4714 (*n_fds)++;
4715 }
4716
4717 return 0;
4718 }
4719
4720 static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
4721 assert(unit);
4722 assert(msg);
4723 assert(executable);
4724
4725 if (!DEBUG_LOGGING)
4726 return;
4727
4728 _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
4729
4730 log_unit_struct(unit, LOG_DEBUG,
4731 "EXECUTABLE=%s", executable,
4732 LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)),
4733 LOG_UNIT_INVOCATION_ID(unit));
4734 }
4735
4736 static bool exec_context_need_unprivileged_private_users(
4737 const ExecContext *context,
4738 const ExecParameters *params) {
4739
4740 assert(context);
4741 assert(params);
4742
4743 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
4744 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
4745 * (system manager) then we have privileges and don't need this. */
4746 if (params->runtime_scope != RUNTIME_SCOPE_USER)
4747 return false;
4748
4749 return context->private_users ||
4750 context->private_tmp ||
4751 context->private_devices ||
4752 context->private_network ||
4753 context->network_namespace_path ||
4754 context->private_ipc ||
4755 context->ipc_namespace_path ||
4756 context->private_mounts > 0 ||
4757 context->mount_apivfs ||
4758 context->n_bind_mounts > 0 ||
4759 context->n_temporary_filesystems > 0 ||
4760 context->root_directory ||
4761 !strv_isempty(context->extension_directories) ||
4762 context->protect_system != PROTECT_SYSTEM_NO ||
4763 context->protect_home != PROTECT_HOME_NO ||
4764 context->protect_kernel_tunables ||
4765 context->protect_kernel_modules ||
4766 context->protect_kernel_logs ||
4767 context->protect_control_groups ||
4768 context->protect_clock ||
4769 context->protect_hostname ||
4770 !strv_isempty(context->read_write_paths) ||
4771 !strv_isempty(context->read_only_paths) ||
4772 !strv_isempty(context->inaccessible_paths) ||
4773 !strv_isempty(context->exec_paths) ||
4774 !strv_isempty(context->no_exec_paths);
4775 }
4776
4777 static int exec_child(
4778 Unit *unit,
4779 const ExecCommand *command,
4780 const ExecContext *context,
4781 const ExecParameters *params,
4782 ExecRuntime *runtime,
4783 const CGroupContext *cgroup_context,
4784 int socket_fd,
4785 const int named_iofds[static 3],
4786 int *params_fds,
4787 size_t n_socket_fds,
4788 size_t n_storage_fds,
4789 char **files_env,
4790 int user_lookup_fd,
4791 int *exit_status) {
4792
4793 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4794 int r, ngids = 0, exec_fd;
4795 _cleanup_free_ gid_t *supplementary_gids = NULL;
4796 const char *username = NULL, *groupname = NULL;
4797 _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
4798 const char *home = NULL, *shell = NULL;
4799 char **final_argv = NULL;
4800 dev_t journal_stream_dev = 0;
4801 ino_t journal_stream_ino = 0;
4802 bool userns_set_up = false;
4803 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4804 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
4805 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
4806 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
4807 #if HAVE_SELINUX
4808 _cleanup_free_ char *mac_selinux_context_net = NULL;
4809 bool use_selinux = false;
4810 #endif
4811 #if ENABLE_SMACK
4812 bool use_smack = false;
4813 #endif
4814 #if HAVE_APPARMOR
4815 bool use_apparmor = false;
4816 #endif
4817 uid_t saved_uid = getuid();
4818 gid_t saved_gid = getgid();
4819 uid_t uid = UID_INVALID;
4820 gid_t gid = GID_INVALID;
4821 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4822 n_keep_fds; /* total number of fds not to close */
4823 int secure_bits;
4824 _cleanup_free_ gid_t *gids_after_pam = NULL;
4825 int ngids_after_pam = 0;
4826 _cleanup_free_ int *fds = NULL;
4827 _cleanup_strv_free_ char **fdnames = NULL;
4828
4829 assert(unit);
4830 assert(command);
4831 assert(context);
4832 assert(params);
4833 assert(exit_status);
4834
4835 /* Explicitly test for CVE-2021-4034 inspired invocations */
4836 assert(command->path);
4837 assert(!strv_isempty(command->argv));
4838
4839 rename_process_from_path(command->path);
4840
4841 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4842 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4843 * both of which will be demoted to SIG_DFL. */
4844 (void) default_signals(SIGNALS_CRASH_HANDLER,
4845 SIGNALS_IGNORE);
4846
4847 if (context->ignore_sigpipe)
4848 (void) ignore_signals(SIGPIPE);
4849
4850 r = reset_signal_mask();
4851 if (r < 0) {
4852 *exit_status = EXIT_SIGNAL_MASK;
4853 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4854 }
4855
4856 if (params->idle_pipe)
4857 do_idle_pipe_dance(params->idle_pipe);
4858
4859 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4860 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4861 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4862 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4863
4864 log_forget_fds();
4865 log_set_open_when_needed(true);
4866 log_settle_target();
4867
4868 /* In case anything used libc syslog(), close this here, too */
4869 closelog();
4870
4871 fds = newdup(int, params_fds, n_fds);
4872 if (!fds) {
4873 *exit_status = EXIT_MEMORY;
4874 return log_oom();
4875 }
4876
4877 fdnames = strv_copy((char**) params->fd_names);
4878 if (!fdnames) {
4879 *exit_status = EXIT_MEMORY;
4880 return log_oom();
4881 }
4882
4883 r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4884 if (r < 0) {
4885 *exit_status = EXIT_FDS;
4886 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4887 }
4888
4889 int keep_fds[n_fds + 3];
4890 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4891 n_keep_fds = n_fds;
4892
4893 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4894 if (r < 0) {
4895 *exit_status = EXIT_FDS;
4896 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4897 }
4898
4899 #if HAVE_LIBBPF
4900 if (unit->manager->restrict_fs) {
4901 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4902 if (bpf_map_fd < 0) {
4903 *exit_status = EXIT_FDS;
4904 return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4905 }
4906
4907 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4908 if (r < 0) {
4909 *exit_status = EXIT_FDS;
4910 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4911 }
4912 }
4913 #endif
4914
4915 r = close_remaining_fds(params, runtime, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4916 if (r < 0) {
4917 *exit_status = EXIT_FDS;
4918 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4919 }
4920
4921 if (!context->same_pgrp &&
4922 setsid() < 0) {
4923 *exit_status = EXIT_SETSID;
4924 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4925 }
4926
4927 exec_context_tty_reset(context, params);
4928
4929 if (unit_shall_confirm_spawn(unit)) {
4930 _cleanup_free_ char *cmdline = NULL;
4931
4932 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4933 if (!cmdline) {
4934 *exit_status = EXIT_MEMORY;
4935 return log_oom();
4936 }
4937
4938 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4939 if (r != CONFIRM_EXECUTE) {
4940 if (r == CONFIRM_PRETEND_SUCCESS) {
4941 *exit_status = EXIT_SUCCESS;
4942 return 0;
4943 }
4944 *exit_status = EXIT_CONFIRM;
4945 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4946 "Execution cancelled by the user");
4947 }
4948 }
4949
4950 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4951 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4952 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4953 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4954 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4955 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4956 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4957 *exit_status = EXIT_MEMORY;
4958 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4959 }
4960
4961 if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4962 _cleanup_strv_free_ char **suggested_paths = NULL;
4963
4964 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4965 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4966 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4967 *exit_status = EXIT_USER;
4968 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4969 }
4970
4971 r = compile_suggested_paths(context, params, &suggested_paths);
4972 if (r < 0) {
4973 *exit_status = EXIT_MEMORY;
4974 return log_oom();
4975 }
4976
4977 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4978 if (r < 0) {
4979 *exit_status = EXIT_USER;
4980 if (r == -EILSEQ)
4981 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4982 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4983 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4984 }
4985
4986 if (!uid_is_valid(uid)) {
4987 *exit_status = EXIT_USER;
4988 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4989 }
4990
4991 if (!gid_is_valid(gid)) {
4992 *exit_status = EXIT_USER;
4993 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4994 }
4995
4996 if (runtime->dynamic_creds->user)
4997 username = runtime->dynamic_creds->user->name;
4998
4999 } else {
5000 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
5001 if (r < 0) {
5002 *exit_status = EXIT_USER;
5003 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5004 }
5005
5006 r = get_fixed_group(context, &groupname, &gid);
5007 if (r < 0) {
5008 *exit_status = EXIT_GROUP;
5009 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
5010 }
5011 }
5012
5013 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
5014 r = get_supplementary_groups(context, username, groupname, gid,
5015 &supplementary_gids, &ngids);
5016 if (r < 0) {
5017 *exit_status = EXIT_GROUP;
5018 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
5019 }
5020
5021 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
5022 if (r < 0) {
5023 *exit_status = EXIT_USER;
5024 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
5025 }
5026
5027 user_lookup_fd = safe_close(user_lookup_fd);
5028
5029 r = acquire_home(context, uid, &home, &home_buffer);
5030 if (r < 0) {
5031 *exit_status = EXIT_CHDIR;
5032 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
5033 }
5034
5035 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
5036 if (socket_fd >= 0)
5037 (void) fd_nonblock(socket_fd, false);
5038
5039 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
5040 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
5041 if (params->cgroup_path) {
5042 _cleanup_free_ char *p = NULL;
5043
5044 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
5045 if (r < 0) {
5046 *exit_status = EXIT_CGROUP;
5047 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
5048 }
5049
5050 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
5051 if (r == -EUCLEAN) {
5052 *exit_status = EXIT_CGROUP;
5053 return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
5054 "because the cgroup or one of its parents or "
5055 "siblings is in the threaded mode: %m", p);
5056 }
5057 if (r < 0) {
5058 *exit_status = EXIT_CGROUP;
5059 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
5060 }
5061 }
5062
5063 if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
5064 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
5065 if (r < 0) {
5066 *exit_status = EXIT_NETWORK;
5067 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
5068 }
5069 }
5070
5071 if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
5072 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
5073 if (r < 0) {
5074 *exit_status = EXIT_NAMESPACE;
5075 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
5076 }
5077 }
5078
5079 r = setup_input(context, params, socket_fd, named_iofds);
5080 if (r < 0) {
5081 *exit_status = EXIT_STDIN;
5082 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
5083 }
5084
5085 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
5086 if (r < 0) {
5087 *exit_status = EXIT_STDOUT;
5088 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
5089 }
5090
5091 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
5092 if (r < 0) {
5093 *exit_status = EXIT_STDERR;
5094 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
5095 }
5096
5097 if (context->oom_score_adjust_set) {
5098 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
5099 * prohibit write access to this file, and we shouldn't trip up over that. */
5100 r = set_oom_score_adjust(context->oom_score_adjust);
5101 if (r < 0) {
5102 if (ERRNO_IS_PRIVILEGE(r))
5103 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
5104 else {
5105 *exit_status = EXIT_OOM_ADJUST;
5106 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
5107 }
5108 }
5109 }
5110
5111 if (context->coredump_filter_set) {
5112 r = set_coredump_filter(context->coredump_filter);
5113 if (r < 0) {
5114 if (ERRNO_IS_PRIVILEGE(r))
5115 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
5116 else
5117 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
5118 }
5119 }
5120
5121 if (context->nice_set) {
5122 r = setpriority_closest(context->nice);
5123 if (r < 0)
5124 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
5125 }
5126
5127 if (context->cpu_sched_set) {
5128 struct sched_param param = {
5129 .sched_priority = context->cpu_sched_priority,
5130 };
5131
5132 r = sched_setscheduler(0,
5133 context->cpu_sched_policy |
5134 (context->cpu_sched_reset_on_fork ?
5135 SCHED_RESET_ON_FORK : 0),
5136 &param);
5137 if (r < 0) {
5138 *exit_status = EXIT_SETSCHEDULER;
5139 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
5140 }
5141 }
5142
5143 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
5144 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
5145 const CPUSet *cpu_set;
5146
5147 if (context->cpu_affinity_from_numa) {
5148 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
5149 if (r < 0) {
5150 *exit_status = EXIT_CPUAFFINITY;
5151 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
5152 }
5153
5154 cpu_set = &converted_cpu_set;
5155 } else
5156 cpu_set = &context->cpu_set;
5157
5158 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
5159 *exit_status = EXIT_CPUAFFINITY;
5160 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
5161 }
5162 }
5163
5164 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
5165 r = apply_numa_policy(&context->numa_policy);
5166 if (r < 0) {
5167 if (ERRNO_IS_NOT_SUPPORTED(r))
5168 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
5169 else {
5170 *exit_status = EXIT_NUMA_POLICY;
5171 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
5172 }
5173 }
5174 }
5175
5176 if (context->ioprio_set)
5177 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
5178 *exit_status = EXIT_IOPRIO;
5179 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
5180 }
5181
5182 if (context->timer_slack_nsec != NSEC_INFINITY)
5183 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
5184 *exit_status = EXIT_TIMERSLACK;
5185 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
5186 }
5187
5188 if (context->personality != PERSONALITY_INVALID) {
5189 r = safe_personality(context->personality);
5190 if (r < 0) {
5191 *exit_status = EXIT_PERSONALITY;
5192 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
5193 }
5194 }
5195
5196 if (context->utmp_id) {
5197 const char *line = context->tty_path ?
5198 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
5199 NULL;
5200 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
5201 line,
5202 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
5203 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
5204 USER_PROCESS,
5205 username);
5206 }
5207
5208 if (uid_is_valid(uid)) {
5209 r = chown_terminal(STDIN_FILENO, uid);
5210 if (r < 0) {
5211 *exit_status = EXIT_STDIN;
5212 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
5213 }
5214 }
5215
5216 if (params->cgroup_path) {
5217 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
5218 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
5219 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
5220 * touch a single hierarchy too. */
5221
5222 if (params->flags & EXEC_CGROUP_DELEGATE) {
5223 _cleanup_free_ char *p = NULL;
5224
5225 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
5226 if (r < 0) {
5227 *exit_status = EXIT_CGROUP;
5228 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
5229 }
5230
5231 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
5232 if (r < 0) {
5233 *exit_status = EXIT_CGROUP;
5234 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
5235 }
5236 if (r > 0) {
5237 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
5238 if (r < 0) {
5239 *exit_status = EXIT_CGROUP;
5240 return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
5241 }
5242 }
5243 }
5244
5245 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
5246 if (cgroup_context_want_memory_pressure(cgroup_context)) {
5247 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
5248 if (r < 0) {
5249 *exit_status = EXIT_MEMORY;
5250 return log_oom();
5251 }
5252
5253 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
5254 if (r < 0) {
5255 log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
5256 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
5257 memory_pressure_path = mfree(memory_pressure_path);
5258 }
5259 } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
5260 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
5261 if (!memory_pressure_path) {
5262 *exit_status = EXIT_MEMORY;
5263 return log_oom();
5264 }
5265 }
5266 }
5267 }
5268
5269 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
5270
5271 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5272 r = setup_exec_directory(unit, context, params, uid, gid, dt, needs_mount_namespace, exit_status);
5273 if (r < 0)
5274 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
5275 }
5276
5277 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
5278 r = setup_credentials(context, params, unit->id, uid);
5279 if (r < 0) {
5280 *exit_status = EXIT_CREDENTIALS;
5281 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
5282 }
5283 }
5284
5285 r = build_environment(
5286 unit,
5287 context,
5288 params,
5289 cgroup_context,
5290 n_fds,
5291 fdnames,
5292 home,
5293 username,
5294 shell,
5295 journal_stream_dev,
5296 journal_stream_ino,
5297 memory_pressure_path,
5298 &our_env);
5299 if (r < 0) {
5300 *exit_status = EXIT_MEMORY;
5301 return log_oom();
5302 }
5303
5304 r = build_pass_environment(context, &pass_env);
5305 if (r < 0) {
5306 *exit_status = EXIT_MEMORY;
5307 return log_oom();
5308 }
5309
5310 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
5311 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
5312 * not specify PATH but the unit has ExecSearchPath. */
5313 if (!strv_isempty(context->exec_search_path)) {
5314 _cleanup_free_ char *joined = NULL;
5315
5316 joined = strv_join(context->exec_search_path, ":");
5317 if (!joined) {
5318 *exit_status = EXIT_MEMORY;
5319 return log_oom();
5320 }
5321
5322 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
5323 if (r < 0) {
5324 *exit_status = EXIT_MEMORY;
5325 return log_oom();
5326 }
5327 }
5328
5329 accum_env = strv_env_merge(params->environment,
5330 our_env,
5331 joined_exec_search_path,
5332 pass_env,
5333 context->environment,
5334 files_env);
5335 if (!accum_env) {
5336 *exit_status = EXIT_MEMORY;
5337 return log_oom();
5338 }
5339 accum_env = strv_env_clean(accum_env);
5340
5341 (void) umask(context->umask);
5342
5343 r = setup_keyring(unit, context, params, uid, gid);
5344 if (r < 0) {
5345 *exit_status = EXIT_KEYRING;
5346 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
5347 }
5348
5349 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
5350 * from it. */
5351 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
5352
5353 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
5354 * for it, and the kernel doesn't actually support ambient caps. */
5355 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
5356
5357 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
5358 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
5359 * desired. */
5360 if (needs_ambient_hack)
5361 needs_setuid = false;
5362 else
5363 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
5364
5365 uint64_t capability_ambient_set = context->capability_ambient_set;
5366
5367 if (needs_sandboxing) {
5368 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
5369 * /sys being present. The actual MAC context application will happen later, as late as
5370 * possible, to avoid impacting our own code paths. */
5371
5372 #if HAVE_SELINUX
5373 use_selinux = mac_selinux_use();
5374 #endif
5375 #if ENABLE_SMACK
5376 use_smack = mac_smack_use();
5377 #endif
5378 #if HAVE_APPARMOR
5379 use_apparmor = mac_apparmor_use();
5380 #endif
5381 }
5382
5383 if (needs_sandboxing) {
5384 int which_failed;
5385
5386 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
5387 * is set here. (See below.) */
5388
5389 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
5390 if (r < 0) {
5391 *exit_status = EXIT_LIMITS;
5392 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
5393 }
5394 }
5395
5396 if (needs_setuid && context->pam_name && username) {
5397 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
5398 * wins here. (See above.) */
5399
5400 /* All fds passed in the fds array will be closed in the pam child process. */
5401 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
5402 if (r < 0) {
5403 *exit_status = EXIT_PAM;
5404 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
5405 }
5406
5407 if (ambient_capabilities_supported()) {
5408 uint64_t ambient_after_pam;
5409
5410 /* PAM modules might have set some ambient caps. Query them here and merge them into
5411 * the caps we want to set in the end, so that we don't end up unsetting them. */
5412 r = capability_get_ambient(&ambient_after_pam);
5413 if (r < 0) {
5414 *exit_status = EXIT_CAPABILITIES;
5415 return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
5416 }
5417
5418 capability_ambient_set |= ambient_after_pam;
5419 }
5420
5421 ngids_after_pam = getgroups_alloc(&gids_after_pam);
5422 if (ngids_after_pam < 0) {
5423 *exit_status = EXIT_MEMORY;
5424 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5425 }
5426 }
5427
5428 if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
5429 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
5430 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
5431 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
5432
5433 r = setup_private_users(saved_uid, saved_gid, uid, gid);
5434 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
5435 * the actual requested operations fail (or silently continue). */
5436 if (r < 0 && context->private_users) {
5437 *exit_status = EXIT_USER;
5438 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5439 }
5440 if (r < 0)
5441 log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
5442 else
5443 userns_set_up = true;
5444 }
5445
5446 if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
5447
5448 /* Try to enable network namespacing if network namespacing is available and we have
5449 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
5450 * new network namespace. And if we don't have that, then we could only create a network
5451 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
5452 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
5453 r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
5454 if (r < 0) {
5455 if (ERRNO_IS_PRIVILEGE(r))
5456 log_unit_notice_errno(unit, r,
5457 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
5458 else {
5459 *exit_status = EXIT_NETWORK;
5460 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
5461 }
5462 }
5463 } else if (context->network_namespace_path) {
5464 *exit_status = EXIT_NETWORK;
5465 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
5466 "NetworkNamespacePath= is not supported, refusing.");
5467 } else
5468 log_unit_notice(unit, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
5469 }
5470
5471 if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
5472
5473 if (ns_type_supported(NAMESPACE_IPC)) {
5474 r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
5475 if (r == -EPERM)
5476 log_unit_warning_errno(unit, r,
5477 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
5478 else if (r < 0) {
5479 *exit_status = EXIT_NAMESPACE;
5480 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
5481 }
5482 } else if (context->ipc_namespace_path) {
5483 *exit_status = EXIT_NAMESPACE;
5484 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
5485 "IPCNamespacePath= is not supported, refusing.");
5486 } else
5487 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
5488 }
5489
5490 if (needs_mount_namespace) {
5491 _cleanup_free_ char *error_path = NULL;
5492
5493 r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
5494 if (r < 0) {
5495 *exit_status = EXIT_NAMESPACE;
5496 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
5497 error_path ? ": " : "", strempty(error_path));
5498 }
5499 }
5500
5501 if (needs_sandboxing) {
5502 r = apply_protect_hostname(unit, context, exit_status);
5503 if (r < 0)
5504 return r;
5505 }
5506
5507 if (context->memory_ksm >= 0)
5508 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
5509 if (ERRNO_IS_NOT_SUPPORTED(errno))
5510 log_unit_debug_errno(unit, errno, "KSM support not available, ignoring.");
5511 else {
5512 *exit_status = EXIT_KSM;
5513 return log_unit_error_errno(unit, errno, "Failed to set KSM: %m");
5514 }
5515 }
5516
5517 /* Drop groups as early as possible.
5518 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
5519 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
5520 if (needs_setuid) {
5521 _cleanup_free_ gid_t *gids_to_enforce = NULL;
5522 int ngids_to_enforce = 0;
5523
5524 ngids_to_enforce = merge_gid_lists(supplementary_gids,
5525 ngids,
5526 gids_after_pam,
5527 ngids_after_pam,
5528 &gids_to_enforce);
5529 if (ngids_to_enforce < 0) {
5530 *exit_status = EXIT_MEMORY;
5531 return log_unit_error_errno(unit,
5532 ngids_to_enforce,
5533 "Failed to merge group lists. Group membership might be incorrect: %m");
5534 }
5535
5536 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
5537 if (r < 0) {
5538 *exit_status = EXIT_GROUP;
5539 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
5540 }
5541 }
5542
5543 /* If the user namespace was not set up above, try to do it now.
5544 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
5545 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5546 * case of mount namespaces being less privileged when the mount point list is copied from a
5547 * different user namespace). */
5548
5549 if (needs_sandboxing && context->private_users && !userns_set_up) {
5550 r = setup_private_users(saved_uid, saved_gid, uid, gid);
5551 if (r < 0) {
5552 *exit_status = EXIT_USER;
5553 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
5554 }
5555 }
5556
5557 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
5558 * shall execute. */
5559
5560 _cleanup_free_ char *executable = NULL;
5561 _cleanup_close_ int executable_fd = -EBADF;
5562 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
5563 if (r < 0) {
5564 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
5565 log_unit_struct_errno(unit, LOG_INFO, r,
5566 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5567 LOG_UNIT_INVOCATION_ID(unit),
5568 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
5569 command->path),
5570 "EXECUTABLE=%s", command->path);
5571 return 0;
5572 }
5573
5574 *exit_status = EXIT_EXEC;
5575
5576 return log_unit_struct_errno(unit, LOG_INFO, r,
5577 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5578 LOG_UNIT_INVOCATION_ID(unit),
5579 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
5580 command->path),
5581 "EXECUTABLE=%s", command->path);
5582 }
5583
5584 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
5585 if (r < 0) {
5586 *exit_status = EXIT_FDS;
5587 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
5588 }
5589
5590 #if HAVE_SELINUX
5591 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
5592 int fd = -EBADF;
5593
5594 if (socket_fd >= 0)
5595 fd = socket_fd;
5596 else if (params->n_socket_fds == 1)
5597 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
5598 * use context from that fd to compute the label. */
5599 fd = params->fds[0];
5600
5601 if (fd >= 0) {
5602 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
5603 if (r < 0) {
5604 if (!context->selinux_context_ignore) {
5605 *exit_status = EXIT_SELINUX_CONTEXT;
5606 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
5607 }
5608 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
5609 }
5610 }
5611 }
5612 #endif
5613
5614 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
5615 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
5616 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
5617 * execve(). */
5618
5619 r = close_all_fds(keep_fds, n_keep_fds);
5620 if (r >= 0)
5621 r = shift_fds(fds, n_fds);
5622 if (r >= 0)
5623 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
5624 if (r < 0) {
5625 *exit_status = EXIT_FDS;
5626 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
5627 }
5628
5629 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
5630 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
5631 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
5632 * came this far. */
5633
5634 secure_bits = context->secure_bits;
5635
5636 if (needs_sandboxing) {
5637 uint64_t bset;
5638
5639 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
5640 * (Note this is placed after the general resource limit initialization, see above, in order
5641 * to take precedence.) */
5642 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
5643 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
5644 *exit_status = EXIT_LIMITS;
5645 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
5646 }
5647 }
5648
5649 #if ENABLE_SMACK
5650 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5651 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5652 if (use_smack) {
5653 r = setup_smack(unit->manager, context, executable_fd);
5654 if (r < 0 && !context->smack_process_label_ignore) {
5655 *exit_status = EXIT_SMACK_PROCESS_LABEL;
5656 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
5657 }
5658 }
5659 #endif
5660
5661 bset = context->capability_bounding_set;
5662 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
5663 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
5664 * instead of us doing that */
5665 if (needs_ambient_hack)
5666 bset |= (UINT64_C(1) << CAP_SETPCAP) |
5667 (UINT64_C(1) << CAP_SETUID) |
5668 (UINT64_C(1) << CAP_SETGID);
5669
5670 if (!cap_test_all(bset)) {
5671 r = capability_bounding_set_drop(bset, /* right_now= */ false);
5672 if (r < 0) {
5673 *exit_status = EXIT_CAPABILITIES;
5674 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
5675 }
5676 }
5677
5678 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5679 * keep-caps set.
5680 *
5681 * To be able to raise the ambient capabilities after setresuid() they have to be added to
5682 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
5683 * the ambient capabilities can be raised as they are present in the permitted and
5684 * inheritable set. However it is possible that someone wants to set ambient capabilities
5685 * without changing the user, so we also set the ambient capabilities here.
5686 *
5687 * The requested ambient capabilities are raised in the inheritable set if the second
5688 * argument is true. */
5689 if (!needs_ambient_hack) {
5690 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
5691 if (r < 0) {
5692 *exit_status = EXIT_CAPABILITIES;
5693 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
5694 }
5695 }
5696 }
5697
5698 /* chroot to root directory first, before we lose the ability to chroot */
5699 r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
5700 if (r < 0)
5701 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
5702
5703 if (needs_setuid) {
5704 if (uid_is_valid(uid)) {
5705 r = enforce_user(context, uid, capability_ambient_set);
5706 if (r < 0) {
5707 *exit_status = EXIT_USER;
5708 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5709 }
5710
5711 if (!needs_ambient_hack && capability_ambient_set != 0) {
5712
5713 /* Raise the ambient capabilities after user change. */
5714 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
5715 if (r < 0) {
5716 *exit_status = EXIT_CAPABILITIES;
5717 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
5718 }
5719 }
5720 }
5721 }
5722
5723 /* Apply working directory here, because the working directory might be on NFS and only the user running
5724 * this service might have the correct privilege to change to the working directory */
5725 r = apply_working_directory(context, params, runtime, home, exit_status);
5726 if (r < 0)
5727 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
5728
5729 if (needs_sandboxing) {
5730 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5731 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5732 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5733 * are restricted. */
5734
5735 #if HAVE_SELINUX
5736 if (use_selinux) {
5737 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5738
5739 if (exec_context) {
5740 r = setexeccon(exec_context);
5741 if (r < 0) {
5742 if (!context->selinux_context_ignore) {
5743 *exit_status = EXIT_SELINUX_CONTEXT;
5744 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5745 }
5746 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5747 }
5748 }
5749 }
5750 #endif
5751
5752 #if HAVE_APPARMOR
5753 if (use_apparmor && context->apparmor_profile) {
5754 r = aa_change_onexec(context->apparmor_profile);
5755 if (r < 0 && !context->apparmor_profile_ignore) {
5756 *exit_status = EXIT_APPARMOR_PROFILE;
5757 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5758 }
5759 }
5760 #endif
5761
5762 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5763 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5764 * requires CAP_SETPCAP. */
5765 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5766 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5767 * effective set here.
5768 *
5769 * The effective set is overwritten during execve() with the following values:
5770 *
5771 * - ambient set (for non-root processes)
5772 *
5773 * - (inheritable | bounding) set (for root processes)
5774 *
5775 * Hence there is no security impact to raise it in the effective set before execve
5776 */
5777 r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
5778 if (r < 0) {
5779 *exit_status = EXIT_CAPABILITIES;
5780 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5781 }
5782 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5783 *exit_status = EXIT_SECUREBITS;
5784 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
5785 }
5786 }
5787
5788 if (context_has_no_new_privileges(context))
5789 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5790 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5791 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
5792 }
5793
5794 #if HAVE_SECCOMP
5795 r = apply_address_families(unit, context);
5796 if (r < 0) {
5797 *exit_status = EXIT_ADDRESS_FAMILIES;
5798 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
5799 }
5800
5801 r = apply_memory_deny_write_execute(unit, context);
5802 if (r < 0) {
5803 *exit_status = EXIT_SECCOMP;
5804 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
5805 }
5806
5807 r = apply_restrict_realtime(unit, context);
5808 if (r < 0) {
5809 *exit_status = EXIT_SECCOMP;
5810 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
5811 }
5812
5813 r = apply_restrict_suid_sgid(unit, context);
5814 if (r < 0) {
5815 *exit_status = EXIT_SECCOMP;
5816 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5817 }
5818
5819 r = apply_restrict_namespaces(unit, context);
5820 if (r < 0) {
5821 *exit_status = EXIT_SECCOMP;
5822 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5823 }
5824
5825 r = apply_protect_sysctl(unit, context);
5826 if (r < 0) {
5827 *exit_status = EXIT_SECCOMP;
5828 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5829 }
5830
5831 r = apply_protect_kernel_modules(unit, context);
5832 if (r < 0) {
5833 *exit_status = EXIT_SECCOMP;
5834 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5835 }
5836
5837 r = apply_protect_kernel_logs(unit, context);
5838 if (r < 0) {
5839 *exit_status = EXIT_SECCOMP;
5840 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5841 }
5842
5843 r = apply_protect_clock(unit, context);
5844 if (r < 0) {
5845 *exit_status = EXIT_SECCOMP;
5846 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5847 }
5848
5849 r = apply_private_devices(unit, context);
5850 if (r < 0) {
5851 *exit_status = EXIT_SECCOMP;
5852 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5853 }
5854
5855 r = apply_syscall_archs(unit, context);
5856 if (r < 0) {
5857 *exit_status = EXIT_SECCOMP;
5858 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5859 }
5860
5861 r = apply_lock_personality(unit, context);
5862 if (r < 0) {
5863 *exit_status = EXIT_SECCOMP;
5864 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5865 }
5866
5867 r = apply_syscall_log(unit, context);
5868 if (r < 0) {
5869 *exit_status = EXIT_SECCOMP;
5870 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5871 }
5872
5873 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5874 * by the filter as little as possible. */
5875 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5876 if (r < 0) {
5877 *exit_status = EXIT_SECCOMP;
5878 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5879 }
5880 #endif
5881
5882 #if HAVE_LIBBPF
5883 r = apply_restrict_filesystems(unit, context);
5884 if (r < 0) {
5885 *exit_status = EXIT_BPF;
5886 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5887 }
5888 #endif
5889
5890 }
5891
5892 if (!strv_isempty(context->unset_environment)) {
5893 char **ee = NULL;
5894
5895 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5896 if (!ee) {
5897 *exit_status = EXIT_MEMORY;
5898 return log_oom();
5899 }
5900
5901 strv_free_and_replace(accum_env, ee);
5902 }
5903
5904 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5905 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5906
5907 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5908 if (r < 0) {
5909 *exit_status = EXIT_MEMORY;
5910 return log_unit_error_errno(unit, r, "Failed to replace environment variables: %m");
5911 }
5912 final_argv = replaced_argv;
5913
5914 if (!strv_isempty(unset_variables)) {
5915 _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5916 log_unit_warning(unit, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5917 }
5918
5919 if (!strv_isempty(bad_variables)) {
5920 _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5921 log_unit_warning(unit, "Invalid environment variable name evaluates to an empty string: %s", strna(jb));;
5922 }
5923 } else
5924 final_argv = command->argv;
5925
5926 log_command_line(unit, "Executing", executable, final_argv);
5927
5928 if (exec_fd >= 0) {
5929 uint8_t hot = 1;
5930
5931 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5932 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5933
5934 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5935 *exit_status = EXIT_EXEC;
5936 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5937 }
5938 }
5939
5940 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5941
5942 if (exec_fd >= 0) {
5943 uint8_t hot = 0;
5944
5945 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5946 * that POLLHUP on it no longer means execve() succeeded. */
5947
5948 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5949 *exit_status = EXIT_EXEC;
5950 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5951 }
5952 }
5953
5954 *exit_status = EXIT_EXEC;
5955 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5956 }
5957
/* Forward declarations: exec_spawn() below calls these two static helpers. */
5958 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
5959 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5960
5961 int exec_spawn(Unit *unit,
5962 ExecCommand *command,
5963 const ExecContext *context,
5964 const ExecParameters *params,
5965 ExecRuntime *runtime,
5966 const CGroupContext *cgroup_context,
5967 pid_t *ret) {
5968
5969 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
5970 _cleanup_free_ char *subcgroup_path = NULL;
5971 _cleanup_strv_free_ char **files_env = NULL;
5972 size_t n_storage_fds = 0, n_socket_fds = 0;
5973 pid_t pid;
5974
5975 assert(unit);
5976 assert(command);
5977 assert(context);
5978 assert(ret);
5979 assert(params);
5980 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
5981
5982 LOG_CONTEXT_PUSH_UNIT(unit);
5983
5984 if (context->std_input == EXEC_INPUT_SOCKET ||
5985 context->std_output == EXEC_OUTPUT_SOCKET ||
5986 context->std_error == EXEC_OUTPUT_SOCKET) {
5987
5988 if (params->n_socket_fds > 1)
5989 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
5990
5991 if (params->n_socket_fds == 0)
5992 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
5993
5994 socket_fd = params->fds[0];
5995 } else {
5996 socket_fd = -EBADF;
5997 fds = params->fds;
5998 n_socket_fds = params->n_socket_fds;
5999 n_storage_fds = params->n_storage_fds;
6000 }
6001
6002 r = exec_context_named_iofds(context, params, named_iofds);
6003 if (r < 0)
6004 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
6005
6006 r = exec_context_load_environment(unit, context, &files_env);
6007 if (r < 0)
6008 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
6009
6010 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
6011 and, until the next SELinux policy changes, we save further reloads in future children. */
6012 mac_selinux_maybe_reload();
6013
6014 /* We won't know the real executable path until we create the mount namespace in the child, but we
6015 want to log from the parent, so we use the possibly inaccurate path here. */
6016 log_command_line(unit, "About to execute", command->path, command->argv);
6017
6018 if (params->cgroup_path) {
6019 r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
6020 if (r < 0)
6021 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
6022 if (r > 0) {
6023 /* If there's a subcgroup, then let's create it here now (the main cgroup was already
6024 * realized by the unit logic) */
6025
6026 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
6027 if (r < 0)
6028 return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
6029 }
6030 }
6031
6032 pid = fork();
6033 if (pid < 0)
6034 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
6035
6036 if (pid == 0) {
6037 int exit_status = EXIT_SUCCESS;
6038
6039 r = exec_child(unit,
6040 command,
6041 context,
6042 params,
6043 runtime,
6044 cgroup_context,
6045 socket_fd,
6046 named_iofds,
6047 fds,
6048 n_socket_fds,
6049 n_storage_fds,
6050 files_env,
6051 unit->manager->user_lookup_fds[1],
6052 &exit_status);
6053
6054 if (r < 0) {
6055 const char *status =
6056 exit_status_to_string(exit_status,
6057 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
6058
6059 log_unit_struct_errno(unit, LOG_ERR, r,
6060 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
6061 LOG_UNIT_INVOCATION_ID(unit),
6062 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
6063 status, command->path),
6064 "EXECUTABLE=%s", command->path);
6065 }
6066
6067 _exit(exit_status);
6068 }
6069
6070 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
6071
6072 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
6073 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
6074 * process will be killed too). */
6075 if (subcgroup_path)
6076 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
6077
6078 exec_status_start(&command->exec_status, pid);
6079
6080 *ret = pid;
6081 return 0;
6082 }
6083
6084 void exec_context_init(ExecContext *c) {
6085 assert(c);
6086
6087 c->umask = 0022;
6088 c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
6089 c->cpu_sched_policy = SCHED_OTHER;
6090 c->syslog_priority = LOG_DAEMON|LOG_INFO;
6091 c->syslog_level_prefix = true;
6092 c->ignore_sigpipe = true;
6093 c->timer_slack_nsec = NSEC_INFINITY;
6094 c->personality = PERSONALITY_INVALID;
6095 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6096 c->directories[t].mode = 0755;
6097 c->timeout_clean_usec = USEC_INFINITY;
6098 c->capability_bounding_set = CAP_MASK_UNSET;
6099 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
6100 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
6101 c->log_level_max = -1;
6102 #if HAVE_SECCOMP
6103 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
6104 #endif
6105 c->tty_rows = UINT_MAX;
6106 c->tty_cols = UINT_MAX;
6107 numa_policy_reset(&c->numa_policy);
6108 c->private_mounts = -1;
6109 c->memory_ksm = -1;
6110 }
6111
6112 void exec_context_done(ExecContext *c) {
6113 assert(c);
6114
6115 c->environment = strv_free(c->environment);
6116 c->environment_files = strv_free(c->environment_files);
6117 c->pass_environment = strv_free(c->pass_environment);
6118 c->unset_environment = strv_free(c->unset_environment);
6119
6120 rlimit_free_all(c->rlimit);
6121
6122 for (size_t l = 0; l < 3; l++) {
6123 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
6124 c->stdio_file[l] = mfree(c->stdio_file[l]);
6125 }
6126
6127 c->working_directory = mfree(c->working_directory);
6128 c->root_directory = mfree(c->root_directory);
6129 c->root_image = mfree(c->root_image);
6130 c->root_image_options = mount_options_free_all(c->root_image_options);
6131 c->root_hash = mfree(c->root_hash);
6132 c->root_hash_size = 0;
6133 c->root_hash_path = mfree(c->root_hash_path);
6134 c->root_hash_sig = mfree(c->root_hash_sig);
6135 c->root_hash_sig_size = 0;
6136 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
6137 c->root_verity = mfree(c->root_verity);
6138 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
6139 c->extension_directories = strv_free(c->extension_directories);
6140 c->tty_path = mfree(c->tty_path);
6141 c->syslog_identifier = mfree(c->syslog_identifier);
6142 c->user = mfree(c->user);
6143 c->group = mfree(c->group);
6144
6145 c->supplementary_groups = strv_free(c->supplementary_groups);
6146
6147 c->pam_name = mfree(c->pam_name);
6148
6149 c->read_only_paths = strv_free(c->read_only_paths);
6150 c->read_write_paths = strv_free(c->read_write_paths);
6151 c->inaccessible_paths = strv_free(c->inaccessible_paths);
6152 c->exec_paths = strv_free(c->exec_paths);
6153 c->no_exec_paths = strv_free(c->no_exec_paths);
6154 c->exec_search_path = strv_free(c->exec_search_path);
6155
6156 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
6157 c->bind_mounts = NULL;
6158 c->n_bind_mounts = 0;
6159 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
6160 c->temporary_filesystems = NULL;
6161 c->n_temporary_filesystems = 0;
6162 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
6163
6164 cpu_set_reset(&c->cpu_set);
6165 numa_policy_reset(&c->numa_policy);
6166
6167 c->utmp_id = mfree(c->utmp_id);
6168 c->selinux_context = mfree(c->selinux_context);
6169 c->apparmor_profile = mfree(c->apparmor_profile);
6170 c->smack_process_label = mfree(c->smack_process_label);
6171
6172 c->restrict_filesystems = set_free_free(c->restrict_filesystems);
6173
6174 c->syscall_filter = hashmap_free(c->syscall_filter);
6175 c->syscall_archs = set_free(c->syscall_archs);
6176 c->address_families = set_free(c->address_families);
6177
6178 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6179 exec_directory_done(&c->directories[t]);
6180
6181 c->log_level_max = -1;
6182
6183 exec_context_free_log_extra_fields(c);
6184 c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns);
6185 c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns);
6186
6187 c->log_ratelimit_interval_usec = 0;
6188 c->log_ratelimit_burst = 0;
6189
6190 c->stdin_data = mfree(c->stdin_data);
6191 c->stdin_data_size = 0;
6192
6193 c->network_namespace_path = mfree(c->network_namespace_path);
6194 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
6195
6196 c->log_namespace = mfree(c->log_namespace);
6197
6198 c->load_credentials = hashmap_free(c->load_credentials);
6199 c->set_credentials = hashmap_free(c->set_credentials);
6200 c->import_credentials = set_free_free(c->import_credentials);
6201
6202 c->root_image_policy = image_policy_free(c->root_image_policy);
6203 c->mount_image_policy = image_policy_free(c->mount_image_policy);
6204 c->extension_image_policy = image_policy_free(c->extension_image_policy);
6205 }
6206
6207 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
6208 assert(c);
6209
6210 if (!runtime_prefix)
6211 return 0;
6212
6213 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
6214 _cleanup_free_ char *p = NULL;
6215
6216 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
6217 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
6218 else
6219 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
6220 if (!p)
6221 return -ENOMEM;
6222
6223 /* We execute this synchronously, since we need to be sure this is gone when we start the
6224 * service next. */
6225 (void) rm_rf(p, REMOVE_ROOT);
6226
6227 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
6228 _cleanup_free_ char *symlink_abs = NULL;
6229
6230 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
6231 symlink_abs = path_join(runtime_prefix, "private", *symlink);
6232 else
6233 symlink_abs = path_join(runtime_prefix, *symlink);
6234 if (!symlink_abs)
6235 return -ENOMEM;
6236
6237 (void) unlink(symlink_abs);
6238 }
6239 }
6240
6241 return 0;
6242 }
6243
6244 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
6245 _cleanup_free_ char *p = NULL;
6246
6247 assert(c);
6248
6249 if (!runtime_prefix || !unit)
6250 return 0;
6251
6252 p = path_join(runtime_prefix, "credentials", unit);
6253 if (!p)
6254 return -ENOMEM;
6255
6256 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
6257 * unmount it, and afterwards remove the mount point */
6258 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
6259 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
6260
6261 return 0;
6262 }
6263
6264 int exec_context_destroy_mount_ns_dir(Unit *u) {
6265 _cleanup_free_ char *p = NULL;
6266
6267 if (!u || !MANAGER_IS_SYSTEM(u->manager))
6268 return 0;
6269
6270 p = path_join("/run/systemd/propagate/", u->id);
6271 if (!p)
6272 return -ENOMEM;
6273
6274 /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
6275 if (rmdir(p) < 0 && errno != ENOENT)
6276 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
6277
6278 return 0;
6279 }
6280
6281 static void exec_command_done(ExecCommand *c) {
6282 assert(c);
6283
6284 c->path = mfree(c->path);
6285 c->argv = strv_free(c->argv);
6286 }
6287
6288 void exec_command_done_array(ExecCommand *c, size_t n) {
6289 for (size_t i = 0; i < n; i++)
6290 exec_command_done(c+i);
6291 }
6292
6293 ExecCommand* exec_command_free_list(ExecCommand *c) {
6294 ExecCommand *i;
6295
6296 while ((i = c)) {
6297 LIST_REMOVE(command, c, i);
6298 exec_command_done(i);
6299 free(i);
6300 }
6301
6302 return NULL;
6303 }
6304
6305 void exec_command_free_array(ExecCommand **c, size_t n) {
6306 for (size_t i = 0; i < n; i++)
6307 c[i] = exec_command_free_list(c[i]);
6308 }
6309
6310 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
6311 for (size_t i = 0; i < n; i++)
6312 exec_status_reset(&c[i].exec_status);
6313 }
6314
6315 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
6316 for (size_t i = 0; i < n; i++)
6317 LIST_FOREACH(command, z, c[i])
6318 exec_status_reset(&z->exec_status);
6319 }
6320
6321 typedef struct InvalidEnvInfo {
6322 const Unit *unit;
6323 const char *path;
6324 } InvalidEnvInfo;
6325
6326 static void invalid_env(const char *p, void *userdata) {
6327 InvalidEnvInfo *info = userdata;
6328
6329 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
6330 }
6331
6332 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
6333 assert(c);
6334
6335 switch (fd_index) {
6336
6337 case STDIN_FILENO:
6338 if (c->std_input != EXEC_INPUT_NAMED_FD)
6339 return NULL;
6340
6341 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
6342
6343 case STDOUT_FILENO:
6344 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
6345 return NULL;
6346
6347 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
6348
6349 case STDERR_FILENO:
6350 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
6351 return NULL;
6352
6353 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
6354
6355 default:
6356 return NULL;
6357 }
6358 }
6359
6360 static int exec_context_named_iofds(
6361 const ExecContext *c,
6362 const ExecParameters *p,
6363 int named_iofds[static 3]) {
6364
6365 size_t targets;
6366 const char* stdio_fdname[3];
6367 size_t n_fds;
6368
6369 assert(c);
6370 assert(p);
6371 assert(named_iofds);
6372
6373 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
6374 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
6375 (c->std_error == EXEC_OUTPUT_NAMED_FD);
6376
6377 for (size_t i = 0; i < 3; i++)
6378 stdio_fdname[i] = exec_context_fdname(c, i);
6379
6380 n_fds = p->n_storage_fds + p->n_socket_fds;
6381
6382 for (size_t i = 0; i < n_fds && targets > 0; i++)
6383 if (named_iofds[STDIN_FILENO] < 0 &&
6384 c->std_input == EXEC_INPUT_NAMED_FD &&
6385 stdio_fdname[STDIN_FILENO] &&
6386 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
6387
6388 named_iofds[STDIN_FILENO] = p->fds[i];
6389 targets--;
6390
6391 } else if (named_iofds[STDOUT_FILENO] < 0 &&
6392 c->std_output == EXEC_OUTPUT_NAMED_FD &&
6393 stdio_fdname[STDOUT_FILENO] &&
6394 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
6395
6396 named_iofds[STDOUT_FILENO] = p->fds[i];
6397 targets--;
6398
6399 } else if (named_iofds[STDERR_FILENO] < 0 &&
6400 c->std_error == EXEC_OUTPUT_NAMED_FD &&
6401 stdio_fdname[STDERR_FILENO] &&
6402 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
6403
6404 named_iofds[STDERR_FILENO] = p->fds[i];
6405 targets--;
6406 }
6407
6408 return targets == 0 ? 0 : -ENOENT;
6409 }
6410
6411 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
6412 _cleanup_strv_free_ char **v = NULL;
6413 int r;
6414
6415 assert(c);
6416 assert(ret);
6417
6418 STRV_FOREACH(i, c->environment_files) {
6419 _cleanup_globfree_ glob_t pglob = {};
6420 bool ignore = false;
6421 char *fn = *i;
6422
6423 if (fn[0] == '-') {
6424 ignore = true;
6425 fn++;
6426 }
6427
6428 if (!path_is_absolute(fn)) {
6429 if (ignore)
6430 continue;
6431 return -EINVAL;
6432 }
6433
6434 /* Filename supports globbing, take all matching files */
6435 r = safe_glob(fn, 0, &pglob);
6436 if (r < 0) {
6437 if (ignore)
6438 continue;
6439 return r;
6440 }
6441
6442 /* When we don't match anything, -ENOENT should be returned */
6443 assert(pglob.gl_pathc > 0);
6444
6445 for (size_t n = 0; n < pglob.gl_pathc; n++) {
6446 _cleanup_strv_free_ char **p = NULL;
6447
6448 r = load_env_file(NULL, pglob.gl_pathv[n], &p);
6449 if (r < 0) {
6450 if (ignore)
6451 continue;
6452 return r;
6453 }
6454
6455 /* Log invalid environment variables with filename */
6456 if (p) {
6457 InvalidEnvInfo info = {
6458 .unit = unit,
6459 .path = pglob.gl_pathv[n]
6460 };
6461
6462 p = strv_env_clean_with_callback(p, invalid_env, &info);
6463 }
6464
6465 if (!v)
6466 v = TAKE_PTR(p);
6467 else {
6468 char **m = strv_env_merge(v, p);
6469 if (!m)
6470 return -ENOMEM;
6471
6472 strv_free_and_replace(v, m);
6473 }
6474 }
6475 }
6476
6477 *ret = TAKE_PTR(v);
6478
6479 return 0;
6480 }
6481
6482 static bool tty_may_match_dev_console(const char *tty) {
6483 _cleanup_free_ char *resolved = NULL;
6484
6485 if (!tty)
6486 return true;
6487
6488 tty = skip_dev_prefix(tty);
6489
6490 /* trivial identity? */
6491 if (streq(tty, "console"))
6492 return true;
6493
6494 if (resolve_dev_console(&resolved) < 0)
6495 return true; /* if we could not resolve, assume it may */
6496
6497 /* "tty0" means the active VC, so it may be the same sometimes */
6498 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6499 }
6500
6501 static bool exec_context_may_touch_tty(const ExecContext *ec) {
6502 assert(ec);
6503
6504 return ec->tty_reset ||
6505 ec->tty_vhangup ||
6506 ec->tty_vt_disallocate ||
6507 is_terminal_input(ec->std_input) ||
6508 is_terminal_output(ec->std_output) ||
6509 is_terminal_output(ec->std_error);
6510 }
6511
6512 bool exec_context_may_touch_console(const ExecContext *ec) {
6513
6514 return exec_context_may_touch_tty(ec) &&
6515 tty_may_match_dev_console(exec_context_tty_path(ec));
6516 }
6517
6518 static void strv_fprintf(FILE *f, char **l) {
6519 assert(f);
6520
6521 STRV_FOREACH(g, l)
6522 fprintf(f, " %s", *g);
6523 }
6524
/* Writes one "<prefix><name>: <item> <item> ..." line for the string vector 'strv' to 'f'. Nothing at all
 * is written if the vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
6536
6537 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
6538 int r;
6539
6540 assert(c);
6541 assert(f);
6542
6543 prefix = strempty(prefix);
6544
6545 fprintf(f,
6546 "%sUMask: %04o\n"
6547 "%sWorkingDirectory: %s\n"
6548 "%sRootDirectory: %s\n"
6549 "%sRootEphemeral: %s\n"
6550 "%sNonBlocking: %s\n"
6551 "%sPrivateTmp: %s\n"
6552 "%sPrivateDevices: %s\n"
6553 "%sProtectKernelTunables: %s\n"
6554 "%sProtectKernelModules: %s\n"
6555 "%sProtectKernelLogs: %s\n"
6556 "%sProtectClock: %s\n"
6557 "%sProtectControlGroups: %s\n"
6558 "%sPrivateNetwork: %s\n"
6559 "%sPrivateUsers: %s\n"
6560 "%sProtectHome: %s\n"
6561 "%sProtectSystem: %s\n"
6562 "%sMountAPIVFS: %s\n"
6563 "%sIgnoreSIGPIPE: %s\n"
6564 "%sMemoryDenyWriteExecute: %s\n"
6565 "%sRestrictRealtime: %s\n"
6566 "%sRestrictSUIDSGID: %s\n"
6567 "%sKeyringMode: %s\n"
6568 "%sProtectHostname: %s\n"
6569 "%sProtectProc: %s\n"
6570 "%sProcSubset: %s\n",
6571 prefix, c->umask,
6572 prefix, empty_to_root(c->working_directory),
6573 prefix, empty_to_root(c->root_directory),
6574 prefix, yes_no(c->root_ephemeral),
6575 prefix, yes_no(c->non_blocking),
6576 prefix, yes_no(c->private_tmp),
6577 prefix, yes_no(c->private_devices),
6578 prefix, yes_no(c->protect_kernel_tunables),
6579 prefix, yes_no(c->protect_kernel_modules),
6580 prefix, yes_no(c->protect_kernel_logs),
6581 prefix, yes_no(c->protect_clock),
6582 prefix, yes_no(c->protect_control_groups),
6583 prefix, yes_no(c->private_network),
6584 prefix, yes_no(c->private_users),
6585 prefix, protect_home_to_string(c->protect_home),
6586 prefix, protect_system_to_string(c->protect_system),
6587 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
6588 prefix, yes_no(c->ignore_sigpipe),
6589 prefix, yes_no(c->memory_deny_write_execute),
6590 prefix, yes_no(c->restrict_realtime),
6591 prefix, yes_no(c->restrict_suid_sgid),
6592 prefix, exec_keyring_mode_to_string(c->keyring_mode),
6593 prefix, yes_no(c->protect_hostname),
6594 prefix, protect_proc_to_string(c->protect_proc),
6595 prefix, proc_subset_to_string(c->proc_subset));
6596
6597 if (c->root_image)
6598 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
6599
6600 if (c->root_image_options) {
6601 fprintf(f, "%sRootImageOptions:", prefix);
6602 LIST_FOREACH(mount_options, o, c->root_image_options)
6603 if (!isempty(o->options))
6604 fprintf(f, " %s:%s",
6605 partition_designator_to_string(o->partition_designator),
6606 o->options);
6607 fprintf(f, "\n");
6608 }
6609
6610 if (c->root_hash) {
6611 _cleanup_free_ char *encoded = NULL;
6612 encoded = hexmem(c->root_hash, c->root_hash_size);
6613 if (encoded)
6614 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
6615 }
6616
6617 if (c->root_hash_path)
6618 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
6619
6620 if (c->root_hash_sig) {
6621 _cleanup_free_ char *encoded = NULL;
6622 ssize_t len;
6623 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
6624 if (len)
6625 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
6626 }
6627
6628 if (c->root_hash_sig_path)
6629 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
6630
6631 if (c->root_verity)
6632 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
6633
6634 STRV_FOREACH(e, c->environment)
6635 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
6636
6637 STRV_FOREACH(e, c->environment_files)
6638 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
6639
6640 STRV_FOREACH(e, c->pass_environment)
6641 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
6642
6643 STRV_FOREACH(e, c->unset_environment)
6644 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
6645
6646 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
6647
6648 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
6649 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
6650
6651 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
6652 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
6653
6654 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
6655 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
6656 }
6657 }
6658
6659 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
6660
6661 if (c->nice_set)
6662 fprintf(f, "%sNice: %i\n", prefix, c->nice);
6663
6664 if (c->oom_score_adjust_set)
6665 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
6666
6667 if (c->coredump_filter_set)
6668 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
6669
6670 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
6671 if (c->rlimit[i]) {
6672 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
6673 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
6674 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
6675 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
6676 }
6677
6678 if (c->ioprio_set) {
6679 _cleanup_free_ char *class_str = NULL;
6680
6681 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
6682 if (r >= 0)
6683 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
6684
6685 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
6686 }
6687
6688 if (c->cpu_sched_set) {
6689 _cleanup_free_ char *policy_str = NULL;
6690
6691 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
6692 if (r >= 0)
6693 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
6694
6695 fprintf(f,
6696 "%sCPUSchedulingPriority: %i\n"
6697 "%sCPUSchedulingResetOnFork: %s\n",
6698 prefix, c->cpu_sched_priority,
6699 prefix, yes_no(c->cpu_sched_reset_on_fork));
6700 }
6701
6702 if (c->cpu_set.set) {
6703 _cleanup_free_ char *affinity = NULL;
6704
6705 affinity = cpu_set_to_range_string(&c->cpu_set);
6706 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
6707 }
6708
6709 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
6710 _cleanup_free_ char *nodes = NULL;
6711
6712 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
6713 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
6714 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
6715 }
6716
6717 if (c->timer_slack_nsec != NSEC_INFINITY)
6718 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
6719
6720 fprintf(f,
6721 "%sStandardInput: %s\n"
6722 "%sStandardOutput: %s\n"
6723 "%sStandardError: %s\n",
6724 prefix, exec_input_to_string(c->std_input),
6725 prefix, exec_output_to_string(c->std_output),
6726 prefix, exec_output_to_string(c->std_error));
6727
6728 if (c->std_input == EXEC_INPUT_NAMED_FD)
6729 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
6730 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
6731 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
6732 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
6733 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
6734
6735 if (c->std_input == EXEC_INPUT_FILE)
6736 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
6737 if (c->std_output == EXEC_OUTPUT_FILE)
6738 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6739 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
6740 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6741 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
6742 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6743 if (c->std_error == EXEC_OUTPUT_FILE)
6744 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6745 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
6746 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6747 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
6748 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6749
6750 if (c->tty_path)
6751 fprintf(f,
6752 "%sTTYPath: %s\n"
6753 "%sTTYReset: %s\n"
6754 "%sTTYVHangup: %s\n"
6755 "%sTTYVTDisallocate: %s\n"
6756 "%sTTYRows: %u\n"
6757 "%sTTYColumns: %u\n",
6758 prefix, c->tty_path,
6759 prefix, yes_no(c->tty_reset),
6760 prefix, yes_no(c->tty_vhangup),
6761 prefix, yes_no(c->tty_vt_disallocate),
6762 prefix, c->tty_rows,
6763 prefix, c->tty_cols);
6764
6765 if (IN_SET(c->std_output,
6766 EXEC_OUTPUT_KMSG,
6767 EXEC_OUTPUT_JOURNAL,
6768 EXEC_OUTPUT_KMSG_AND_CONSOLE,
6769 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
6770 IN_SET(c->std_error,
6771 EXEC_OUTPUT_KMSG,
6772 EXEC_OUTPUT_JOURNAL,
6773 EXEC_OUTPUT_KMSG_AND_CONSOLE,
6774 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
6775
6776 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
6777
6778 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
6779 if (r >= 0)
6780 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
6781
6782 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
6783 if (r >= 0)
6784 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
6785 }
6786
6787 if (c->log_level_max >= 0) {
6788 _cleanup_free_ char *t = NULL;
6789
6790 (void) log_level_to_string_alloc(c->log_level_max, &t);
6791
6792 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
6793 }
6794
6795 if (c->log_ratelimit_interval_usec > 0)
6796 fprintf(f,
6797 "%sLogRateLimitIntervalSec: %s\n",
6798 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
6799
6800 if (c->log_ratelimit_burst > 0)
6801 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
6802
6803 if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
6804 fprintf(f, "%sLogFilterPatterns:", prefix);
6805
6806 char *pattern;
6807 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
6808 fprintf(f, " %s", pattern);
6809 SET_FOREACH(pattern, c->log_filter_denied_patterns)
6810 fprintf(f, " ~%s", pattern);
6811 fputc('\n', f);
6812 }
6813
6814 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
6815 fprintf(f, "%sLogExtraFields: ", prefix);
6816 fwrite(c->log_extra_fields[j].iov_base,
6817 1, c->log_extra_fields[j].iov_len,
6818 f);
6819 fputc('\n', f);
6820 }
6821
6822 if (c->log_namespace)
6823 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
6824
6825 if (c->secure_bits) {
6826 _cleanup_free_ char *str = NULL;
6827
6828 r = secure_bits_to_string_alloc(c->secure_bits, &str);
6829 if (r >= 0)
6830 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
6831 }
6832
6833 if (c->capability_bounding_set != CAP_MASK_UNSET) {
6834 _cleanup_free_ char *str = NULL;
6835
6836 r = capability_set_to_string(c->capability_bounding_set, &str);
6837 if (r >= 0)
6838 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6839 }
6840
6841 if (c->capability_ambient_set != 0) {
6842 _cleanup_free_ char *str = NULL;
6843
6844 r = capability_set_to_string(c->capability_ambient_set, &str);
6845 if (r >= 0)
6846 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6847 }
6848
6849 if (c->user)
6850 fprintf(f, "%sUser: %s\n", prefix, c->user);
6851 if (c->group)
6852 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6853
6854 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6855
6856 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6857
6858 if (c->pam_name)
6859 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6860
6861 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6862 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6863 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6864 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6865 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6866 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6867
6868 for (size_t i = 0; i < c->n_bind_mounts; i++)
6869 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6870 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6871 c->bind_mounts[i].ignore_enoent ? "-": "",
6872 c->bind_mounts[i].source,
6873 c->bind_mounts[i].destination,
6874 c->bind_mounts[i].recursive ? "rbind" : "norbind");
6875
6876 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6877 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6878
6879 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6880 t->path,
6881 isempty(t->options) ? "" : ":",
6882 strempty(t->options));
6883 }
6884
6885 if (c->utmp_id)
6886 fprintf(f,
6887 "%sUtmpIdentifier: %s\n",
6888 prefix, c->utmp_id);
6889
6890 if (c->selinux_context)
6891 fprintf(f,
6892 "%sSELinuxContext: %s%s\n",
6893 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6894
6895 if (c->apparmor_profile)
6896 fprintf(f,
6897 "%sAppArmorProfile: %s%s\n",
6898 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6899
6900 if (c->smack_process_label)
6901 fprintf(f,
6902 "%sSmackProcessLabel: %s%s\n",
6903 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6904
6905 if (c->personality != PERSONALITY_INVALID)
6906 fprintf(f,
6907 "%sPersonality: %s\n",
6908 prefix, strna(personality_to_string(c->personality)));
6909
6910 fprintf(f,
6911 "%sLockPersonality: %s\n",
6912 prefix, yes_no(c->lock_personality));
6913
6914 if (c->syscall_filter) {
6915 fprintf(f,
6916 "%sSystemCallFilter: ",
6917 prefix);
6918
6919 if (!c->syscall_allow_list)
6920 fputc('~', f);
6921
6922 #if HAVE_SECCOMP
6923 void *id, *val;
6924 bool first = true;
6925 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6926 _cleanup_free_ char *name = NULL;
6927 const char *errno_name = NULL;
6928 int num = PTR_TO_INT(val);
6929
6930 if (first)
6931 first = false;
6932 else
6933 fputc(' ', f);
6934
6935 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6936 fputs(strna(name), f);
6937
6938 if (num >= 0) {
6939 errno_name = seccomp_errno_or_action_to_string(num);
6940 if (errno_name)
6941 fprintf(f, ":%s", errno_name);
6942 else
6943 fprintf(f, ":%d", num);
6944 }
6945 }
6946 #endif
6947
6948 fputc('\n', f);
6949 }
6950
6951 if (c->syscall_archs) {
6952 fprintf(f,
6953 "%sSystemCallArchitectures:",
6954 prefix);
6955
6956 #if HAVE_SECCOMP
6957 void *id;
6958 SET_FOREACH(id, c->syscall_archs)
6959 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6960 #endif
6961 fputc('\n', f);
6962 }
6963
6964 if (exec_context_restrict_namespaces_set(c)) {
6965 _cleanup_free_ char *s = NULL;
6966
6967 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6968 if (r >= 0)
6969 fprintf(f, "%sRestrictNamespaces: %s\n",
6970 prefix, strna(s));
6971 }
6972
6973 #if HAVE_LIBBPF
6974 if (exec_context_restrict_filesystems_set(c)) {
6975 char *fs;
6976 SET_FOREACH(fs, c->restrict_filesystems)
6977 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6978 }
6979 #endif
6980
6981 if (c->network_namespace_path)
6982 fprintf(f,
6983 "%sNetworkNamespacePath: %s\n",
6984 prefix, c->network_namespace_path);
6985
6986 if (c->syscall_errno > 0) {
6987 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6988
6989 #if HAVE_SECCOMP
6990 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6991 if (errno_name)
6992 fputs(errno_name, f);
6993 else
6994 fprintf(f, "%d", c->syscall_errno);
6995 #endif
6996 fputc('\n', f);
6997 }
6998
6999 for (size_t i = 0; i < c->n_mount_images; i++) {
7000 fprintf(f, "%sMountImages: %s%s:%s", prefix,
7001 c->mount_images[i].ignore_enoent ? "-": "",
7002 c->mount_images[i].source,
7003 c->mount_images[i].destination);
7004 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
7005 fprintf(f, ":%s:%s",
7006 partition_designator_to_string(o->partition_designator),
7007 strempty(o->options));
7008 fprintf(f, "\n");
7009 }
7010
7011 for (size_t i = 0; i < c->n_extension_images; i++) {
7012 fprintf(f, "%sExtensionImages: %s%s", prefix,
7013 c->extension_images[i].ignore_enoent ? "-": "",
7014 c->extension_images[i].source);
7015 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
7016 fprintf(f, ":%s:%s",
7017 partition_designator_to_string(o->partition_designator),
7018 strempty(o->options));
7019 fprintf(f, "\n");
7020 }
7021
7022 strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
7023 }
7024
7025 bool exec_context_maintains_privileges(const ExecContext *c) {
7026 assert(c);
7027
7028 /* Returns true if the process forked off would run under
7029 * an unchanged UID or as root. */
7030
7031 if (!c->user)
7032 return true;
7033
7034 if (streq(c->user, "root") || streq(c->user, "0"))
7035 return true;
7036
7037 return false;
7038 }
7039
7040 int exec_context_get_effective_ioprio(const ExecContext *c) {
7041 int p;
7042
7043 assert(c);
7044
7045 if (c->ioprio_set)
7046 return c->ioprio;
7047
7048 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
7049 if (p < 0)
7050 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
7051
7052 return ioprio_normalize(p);
7053 }
7054
7055 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
7056 assert(c);
7057
7058 /* Explicit setting wins */
7059 if (c->mount_apivfs_set)
7060 return c->mount_apivfs;
7061
7062 /* Default to "yes" if root directory or image are specified */
7063 if (exec_context_with_rootfs(c))
7064 return true;
7065
7066 return false;
7067 }
7068
7069 void exec_context_free_log_extra_fields(ExecContext *c) {
7070 assert(c);
7071
7072 for (size_t l = 0; l < c->n_log_extra_fields; l++)
7073 free(c->log_extra_fields[l].iov_base);
7074 c->log_extra_fields = mfree(c->log_extra_fields);
7075 c->n_log_extra_fields = 0;
7076 }
7077
7078 void exec_context_revert_tty(ExecContext *c) {
7079 _cleanup_close_ int fd = -EBADF;
7080 const char *path;
7081 struct stat st;
7082 int r;
7083
7084 assert(c);
7085
7086 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
7087 exec_context_tty_reset(c, NULL);
7088
7089 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
7090 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
7091 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
7092 if (!exec_context_may_touch_tty(c))
7093 return;
7094
7095 path = exec_context_tty_path(c);
7096 if (!path)
7097 return;
7098
7099 fd = open(path, O_PATH|O_CLOEXEC);
7100 if (fd < 0)
7101 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
7102 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
7103 path);
7104
7105 if (fstat(fd, &st) < 0)
7106 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
7107
7108 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
7109 * if things are a character device, since a proper check either means we'd have to open the TTY and
7110 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
7111 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
7112 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
7113 if (!S_ISCHR(st.st_mode))
7114 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
7115
7116 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
7117 if (r < 0)
7118 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
7119 }
7120
7121 int exec_context_get_clean_directories(
7122 ExecContext *c,
7123 char **prefix,
7124 ExecCleanMask mask,
7125 char ***ret) {
7126
7127 _cleanup_strv_free_ char **l = NULL;
7128 int r;
7129
7130 assert(c);
7131 assert(prefix);
7132 assert(ret);
7133
7134 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
7135 if (!FLAGS_SET(mask, 1U << t))
7136 continue;
7137
7138 if (!prefix[t])
7139 continue;
7140
7141 for (size_t i = 0; i < c->directories[t].n_items; i++) {
7142 char *j;
7143
7144 j = path_join(prefix[t], c->directories[t].items[i].path);
7145 if (!j)
7146 return -ENOMEM;
7147
7148 r = strv_consume(&l, j);
7149 if (r < 0)
7150 return r;
7151
7152 /* Also remove private directories unconditionally. */
7153 if (t != EXEC_DIRECTORY_CONFIGURATION) {
7154 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
7155 if (!j)
7156 return -ENOMEM;
7157
7158 r = strv_consume(&l, j);
7159 if (r < 0)
7160 return r;
7161 }
7162
7163 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
7164 j = path_join(prefix[t], *symlink);
7165 if (!j)
7166 return -ENOMEM;
7167
7168 r = strv_consume(&l, j);
7169 if (r < 0)
7170 return r;
7171 }
7172 }
7173 }
7174
7175 *ret = TAKE_PTR(l);
7176 return 0;
7177 }
7178
7179 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
7180 ExecCleanMask mask = 0;
7181
7182 assert(c);
7183 assert(ret);
7184
7185 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
7186 if (c->directories[t].n_items > 0)
7187 mask |= 1U << t;
7188
7189 *ret = mask;
7190 return 0;
7191 }
7192
7193 bool exec_context_has_encrypted_credentials(ExecContext *c) {
7194 ExecLoadCredential *load_cred;
7195 ExecSetCredential *set_cred;
7196
7197 assert(c);
7198
7199 HASHMAP_FOREACH(load_cred, c->load_credentials)
7200 if (load_cred->encrypted)
7201 return true;
7202
7203 HASHMAP_FOREACH(set_cred, c->set_credentials)
7204 if (set_cred->encrypted)
7205 return true;
7206
7207 return false;
7208 }
7209
7210 void exec_status_start(ExecStatus *s, pid_t pid) {
7211 assert(s);
7212
7213 *s = (ExecStatus) {
7214 .pid = pid,
7215 };
7216
7217 dual_timestamp_get(&s->start_timestamp);
7218 }
7219
7220 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
7221 assert(s);
7222
7223 if (s->pid != pid)
7224 *s = (ExecStatus) {
7225 .pid = pid,
7226 };
7227
7228 dual_timestamp_get(&s->exit_timestamp);
7229
7230 s->code = code;
7231 s->status = status;
7232
7233 if (context && context->utmp_id)
7234 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
7235 }
7236
7237 void exec_status_reset(ExecStatus *s) {
7238 assert(s);
7239
7240 *s = (ExecStatus) {};
7241 }
7242
7243 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
7244 assert(s);
7245 assert(f);
7246
7247 if (s->pid <= 0)
7248 return;
7249
7250 prefix = strempty(prefix);
7251
7252 fprintf(f,
7253 "%sPID: "PID_FMT"\n",
7254 prefix, s->pid);
7255
7256 if (dual_timestamp_is_set(&s->start_timestamp))
7257 fprintf(f,
7258 "%sStart Timestamp: %s\n",
7259 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
7260
7261 if (dual_timestamp_is_set(&s->exit_timestamp))
7262 fprintf(f,
7263 "%sExit Timestamp: %s\n"
7264 "%sExit Code: %s\n"
7265 "%sExit Status: %i\n",
7266 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
7267 prefix, sigchld_code_to_string(s->code),
7268 prefix, s->status);
7269 }
7270
7271 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
7272 _cleanup_free_ char *cmd = NULL;
7273 const char *prefix2;
7274
7275 assert(c);
7276 assert(f);
7277
7278 prefix = strempty(prefix);
7279 prefix2 = strjoina(prefix, "\t");
7280
7281 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
7282
7283 fprintf(f,
7284 "%sCommand Line: %s\n",
7285 prefix, strnull(cmd));
7286
7287 exec_status_dump(&c->exec_status, f, prefix2);
7288 }
7289
7290 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
7291 assert(f);
7292
7293 prefix = strempty(prefix);
7294
7295 LIST_FOREACH(command, i, c)
7296 exec_command_dump(i, f, prefix);
7297 }
7298
7299 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
7300 ExecCommand *end;
7301
7302 assert(l);
7303 assert(e);
7304
7305 if (*l) {
7306 /* It's kind of important, that we keep the order here */
7307 end = LIST_FIND_TAIL(command, *l);
7308 LIST_INSERT_AFTER(command, *l, end, e);
7309 } else
7310 *l = e;
7311 }
7312
7313 int exec_command_set(ExecCommand *c, const char *path, ...) {
7314 va_list ap;
7315 char **l, *p;
7316
7317 assert(c);
7318 assert(path);
7319
7320 va_start(ap, path);
7321 l = strv_new_ap(path, ap);
7322 va_end(ap);
7323
7324 if (!l)
7325 return -ENOMEM;
7326
7327 p = strdup(path);
7328 if (!p) {
7329 strv_free(l);
7330 return -ENOMEM;
7331 }
7332
7333 free_and_replace(c->path, p);
7334
7335 return strv_free_and_replace(c->argv, l);
7336 }
7337
7338 int exec_command_append(ExecCommand *c, const char *path, ...) {
7339 _cleanup_strv_free_ char **l = NULL;
7340 va_list ap;
7341 int r;
7342
7343 assert(c);
7344 assert(path);
7345
7346 va_start(ap, path);
7347 l = strv_new_ap(path, ap);
7348 va_end(ap);
7349
7350 if (!l)
7351 return -ENOMEM;
7352
7353 r = strv_extend_strv(&c->argv, l, false);
7354 if (r < 0)
7355 return r;
7356
7357 return 0;
7358 }
7359
7360 static char *destroy_tree(char *path) {
7361 if (!path)
7362 return NULL;
7363
7364 if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
7365 log_debug("Spawning process to nuke '%s'", path);
7366
7367 (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
7368 }
7369
7370 return mfree(path);
7371 }
7372
7373 static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
7374 if (!rt)
7375 return NULL;
7376
7377 if (rt->manager)
7378 (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
7379
7380 rt->id = mfree(rt->id);
7381 rt->tmp_dir = mfree(rt->tmp_dir);
7382 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
7383 safe_close_pair(rt->netns_storage_socket);
7384 safe_close_pair(rt->ipcns_storage_socket);
7385 return mfree(rt);
7386 }
7387
7388 DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
7389 DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
7390
7391 ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
7392 if (!rt)
7393 return NULL;
7394
7395 assert(rt->n_ref > 0);
7396 rt->n_ref--;
7397
7398 if (rt->n_ref > 0)
7399 return NULL;
7400
7401 rt->tmp_dir = destroy_tree(rt->tmp_dir);
7402 rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
7403
7404 return exec_shared_runtime_free(rt);
7405 }
7406
7407 static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
7408 _cleanup_free_ char *id_copy = NULL;
7409 ExecSharedRuntime *n;
7410
7411 assert(ret);
7412
7413 id_copy = strdup(id);
7414 if (!id_copy)
7415 return -ENOMEM;
7416
7417 n = new(ExecSharedRuntime, 1);
7418 if (!n)
7419 return -ENOMEM;
7420
7421 *n = (ExecSharedRuntime) {
7422 .id = TAKE_PTR(id_copy),
7423 .netns_storage_socket = PIPE_EBADF,
7424 .ipcns_storage_socket = PIPE_EBADF,
7425 };
7426
7427 *ret = n;
7428 return 0;
7429 }
7430
7431 static int exec_shared_runtime_add(
7432 Manager *m,
7433 const char *id,
7434 char **tmp_dir,
7435 char **var_tmp_dir,
7436 int netns_storage_socket[2],
7437 int ipcns_storage_socket[2],
7438 ExecSharedRuntime **ret) {
7439
7440 _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
7441 int r;
7442
7443 assert(m);
7444 assert(id);
7445
7446 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
7447
7448 r = exec_shared_runtime_allocate(&rt, id);
7449 if (r < 0)
7450 return r;
7451
7452 r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
7453 if (r < 0)
7454 return r;
7455
7456 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
7457 rt->tmp_dir = TAKE_PTR(*tmp_dir);
7458 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
7459
7460 if (netns_storage_socket) {
7461 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
7462 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
7463 }
7464
7465 if (ipcns_storage_socket) {
7466 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
7467 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
7468 }
7469
7470 rt->manager = m;
7471
7472 if (ret)
7473 *ret = rt;
7474 /* do not remove created ExecSharedRuntime object when the operation succeeds. */
7475 TAKE_PTR(rt);
7476 return 0;
7477 }
7478
7479 static int exec_shared_runtime_make(
7480 Manager *m,
7481 const ExecContext *c,
7482 const char *id,
7483 ExecSharedRuntime **ret) {
7484
7485 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
7486 _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
7487 int r;
7488
7489 assert(m);
7490 assert(c);
7491 assert(id);
7492
7493 /* It is not necessary to create ExecSharedRuntime object. */
7494 if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
7495 *ret = NULL;
7496 return 0;
7497 }
7498
7499 if (c->private_tmp &&
7500 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
7501 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
7502 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
7503 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
7504 if (r < 0)
7505 return r;
7506 }
7507
7508 if (exec_needs_network_namespace(c)) {
7509 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
7510 return -errno;
7511 }
7512
7513 if (exec_needs_ipc_namespace(c)) {
7514 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
7515 return -errno;
7516 }
7517
7518 r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
7519 if (r < 0)
7520 return r;
7521
7522 return 1;
7523 }
7524
7525 int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
7526 ExecSharedRuntime *rt;
7527 int r;
7528
7529 assert(m);
7530 assert(id);
7531 assert(ret);
7532
7533 rt = hashmap_get(m->exec_shared_runtime_by_id, id);
7534 if (rt)
7535 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
7536 goto ref;
7537
7538 if (!create) {
7539 *ret = NULL;
7540 return 0;
7541 }
7542
7543 /* If not found, then create a new object. */
7544 r = exec_shared_runtime_make(m, c, id, &rt);
7545 if (r < 0)
7546 return r;
7547 if (r == 0) {
7548 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
7549 *ret = NULL;
7550 return 0;
7551 }
7552
7553 ref:
7554 /* increment reference counter. */
7555 rt->n_ref++;
7556 *ret = rt;
7557 return 1;
7558 }
7559
7560 int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
7561 ExecSharedRuntime *rt;
7562
7563 assert(m);
7564 assert(f);
7565 assert(fds);
7566
7567 HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
7568 fprintf(f, "exec-runtime=%s", rt->id);
7569
7570 if (rt->tmp_dir)
7571 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
7572
7573 if (rt->var_tmp_dir)
7574 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
7575
7576 if (rt->netns_storage_socket[0] >= 0) {
7577 int copy;
7578
7579 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
7580 if (copy < 0)
7581 return copy;
7582
7583 fprintf(f, " netns-socket-0=%i", copy);
7584 }
7585
7586 if (rt->netns_storage_socket[1] >= 0) {
7587 int copy;
7588
7589 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
7590 if (copy < 0)
7591 return copy;
7592
7593 fprintf(f, " netns-socket-1=%i", copy);
7594 }
7595
7596 if (rt->ipcns_storage_socket[0] >= 0) {
7597 int copy;
7598
7599 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
7600 if (copy < 0)
7601 return copy;
7602
7603 fprintf(f, " ipcns-socket-0=%i", copy);
7604 }
7605
7606 if (rt->ipcns_storage_socket[1] >= 0) {
7607 int copy;
7608
7609 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
7610 if (copy < 0)
7611 return copy;
7612
7613 fprintf(f, " ipcns-socket-1=%i", copy);
7614 }
7615
7616 fputc('\n', f);
7617 }
7618
7619 return 0;
7620 }
7621
7622 int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
7623 _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
7624 ExecSharedRuntime *rt;
7625 int r;
7626
7627 /* This is for the migration from old (v237 or earlier) deserialization text.
7628 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
7629 * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
7630 * so or not from the serialized text, then we always creates a new object owned by this. */
7631
7632 assert(u);
7633 assert(key);
7634 assert(value);
7635
7636 /* Manager manages ExecSharedRuntime objects by the unit id.
7637 * So, we omit the serialized text when the unit does not have id (yet?)... */
7638 if (isempty(u->id)) {
7639 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
7640 return 0;
7641 }
7642
7643 if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
7644 return log_oom();
7645
7646 rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
7647 if (!rt) {
7648 if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
7649 return log_oom();
7650
7651 rt = rt_create;
7652 }
7653
7654 if (streq(key, "tmp-dir")) {
7655 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
7656 return -ENOMEM;
7657
7658 } else if (streq(key, "var-tmp-dir")) {
7659 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
7660 return -ENOMEM;
7661
7662 } else if (streq(key, "netns-socket-0")) {
7663 int fd;
7664
7665 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
7666 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
7667 return 0;
7668 }
7669
7670 safe_close(rt->netns_storage_socket[0]);
7671 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
7672
7673 } else if (streq(key, "netns-socket-1")) {
7674 int fd;
7675
7676 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
7677 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
7678 return 0;
7679 }
7680
7681 safe_close(rt->netns_storage_socket[1]);
7682 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
7683
7684 } else
7685 return 0;
7686
7687 /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
7688 if (rt_create) {
7689 r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
7690 if (r < 0) {
7691 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
7692 return 0;
7693 }
7694
7695 rt_create->manager = u->manager;
7696
7697 /* Avoid cleanup */
7698 TAKE_PTR(rt_create);
7699 }
7700
7701 return 1;
7702 }
7703
7704 int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
7705 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
7706 char *id = NULL;
7707 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
7708 const char *p, *v = ASSERT_PTR(value);
7709 size_t n;
7710
7711 assert(m);
7712 assert(fds);
7713
7714 n = strcspn(v, " ");
7715 id = strndupa_safe(v, n);
7716 if (v[n] != ' ')
7717 goto finalize;
7718 p = v + n + 1;
7719
7720 v = startswith(p, "tmp-dir=");
7721 if (v) {
7722 n = strcspn(v, " ");
7723 tmp_dir = strndup(v, n);
7724 if (!tmp_dir)
7725 return log_oom();
7726 if (v[n] != ' ')
7727 goto finalize;
7728 p = v + n + 1;
7729 }
7730
7731 v = startswith(p, "var-tmp-dir=");
7732 if (v) {
7733 n = strcspn(v, " ");
7734 var_tmp_dir = strndup(v, n);
7735 if (!var_tmp_dir)
7736 return log_oom();
7737 if (v[n] != ' ')
7738 goto finalize;
7739 p = v + n + 1;
7740 }
7741
7742 v = startswith(p, "netns-socket-0=");
7743 if (v) {
7744 char *buf;
7745
7746 n = strcspn(v, " ");
7747 buf = strndupa_safe(v, n);
7748
7749 netns_fdpair[0] = parse_fd(buf);
7750 if (netns_fdpair[0] < 0)
7751 return log_debug_errno(netns_fdpair[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
7752 if (!fdset_contains(fds, netns_fdpair[0]))
7753 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7754 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
7755 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
7756 if (v[n] != ' ')
7757 goto finalize;
7758 p = v + n + 1;
7759 }
7760
7761 v = startswith(p, "netns-socket-1=");
7762 if (v) {
7763 char *buf;
7764
7765 n = strcspn(v, " ");
7766 buf = strndupa_safe(v, n);
7767
7768 netns_fdpair[1] = parse_fd(buf);
7769 if (netns_fdpair[1] < 0)
7770 return log_debug_errno(netns_fdpair[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
7771 if (!fdset_contains(fds, netns_fdpair[1]))
7772 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7773 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
7774 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
7775 if (v[n] != ' ')
7776 goto finalize;
7777 p = v + n + 1;
7778 }
7779
7780 v = startswith(p, "ipcns-socket-0=");
7781 if (v) {
7782 char *buf;
7783
7784 n = strcspn(v, " ");
7785 buf = strndupa_safe(v, n);
7786
7787 ipcns_fdpair[0] = parse_fd(buf);
7788 if (ipcns_fdpair[0] < 0)
7789 return log_debug_errno(ipcns_fdpair[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
7790 if (!fdset_contains(fds, ipcns_fdpair[0]))
7791 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7792 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
7793 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
7794 if (v[n] != ' ')
7795 goto finalize;
7796 p = v + n + 1;
7797 }
7798
7799 v = startswith(p, "ipcns-socket-1=");
7800 if (v) {
7801 char *buf;
7802
7803 n = strcspn(v, " ");
7804 buf = strndupa_safe(v, n);
7805
7806 ipcns_fdpair[1] = parse_fd(buf);
7807 if (ipcns_fdpair[1] < 0)
7808 return log_debug_errno(ipcns_fdpair[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
7809 if (!fdset_contains(fds, ipcns_fdpair[1]))
7810 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7811 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
7812 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
7813 }
7814
7815 finalize:
7816 r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7817 if (r < 0)
7818 return log_debug_errno(r, "Failed to add exec-runtime: %m");
7819 return 0;
7820 }
7821
7822 void exec_shared_runtime_vacuum(Manager *m) {
7823 ExecSharedRuntime *rt;
7824
7825 assert(m);
7826
7827 /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
7828
7829 HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
7830 if (rt->n_ref > 0)
7831 continue;
7832
7833 (void) exec_shared_runtime_free(rt);
7834 }
7835 }
7836
7837 int exec_runtime_make(
7838 const Unit *unit,
7839 const ExecContext *context,
7840 ExecSharedRuntime *shared,
7841 DynamicCreds *creds,
7842 ExecRuntime **ret) {
7843 _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
7844 _cleanup_free_ char *ephemeral = NULL;
7845 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
7846 int r;
7847
7848 assert(unit);
7849 assert(context);
7850 assert(ret);
7851
7852 if (!shared && !creds && !exec_needs_ephemeral(context)) {
7853 *ret = NULL;
7854 return 0;
7855 }
7856
7857 if (exec_needs_ephemeral(context)) {
7858 r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
7859 if (r < 0)
7860 return r;
7861
7862 r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
7863 if (r < 0)
7864 return r;
7865
7866 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
7867 return -errno;
7868 }
7869
7870 rt = new(ExecRuntime, 1);
7871 if (!rt)
7872 return -ENOMEM;
7873
7874 *rt = (ExecRuntime) {
7875 .shared = shared,
7876 .dynamic_creds = creds,
7877 .ephemeral_copy = TAKE_PTR(ephemeral),
7878 .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
7879 .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
7880 };
7881
7882 *ret = TAKE_PTR(rt);
7883 return 1;
7884 }
7885
7886 ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
7887 if (!rt)
7888 return NULL;
7889
7890 exec_shared_runtime_unref(rt->shared);
7891 dynamic_creds_unref(rt->dynamic_creds);
7892
7893 rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
7894
7895 safe_close_pair(rt->ephemeral_storage_socket);
7896 return mfree(rt);
7897 }
7898
7899 ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
7900 if (!rt)
7901 return NULL;
7902
7903 rt->shared = exec_shared_runtime_destroy(rt->shared);
7904 rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
7905 return exec_runtime_free(rt);
7906 }
7907
7908 void exec_params_clear(ExecParameters *p) {
7909 if (!p)
7910 return;
7911
7912 p->environment = strv_free(p->environment);
7913 p->fd_names = strv_free(p->fd_names);
7914 p->fds = mfree(p->fds);
7915 p->exec_fd = safe_close(p->exec_fd);
7916 }
7917
7918 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7919 if (!sc)
7920 return NULL;
7921
7922 free(sc->id);
7923 free(sc->data);
7924 return mfree(sc);
7925 }
7926
7927 ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7928 if (!lc)
7929 return NULL;
7930
7931 free(lc->id);
7932 free(lc->path);
7933 return mfree(lc);
7934 }
7935
7936 void exec_directory_done(ExecDirectory *d) {
7937 if (!d)
7938 return;
7939
7940 for (size_t i = 0; i < d->n_items; i++) {
7941 free(d->items[i].path);
7942 strv_free(d->items[i].symlinks);
7943 }
7944
7945 d->items = mfree(d->items);
7946 d->n_items = 0;
7947 d->mode = 0755;
7948 }
7949
7950 static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7951 assert(d);
7952 assert(path);
7953
7954 for (size_t i = 0; i < d->n_items; i++)
7955 if (path_equal(d->items[i].path, path))
7956 return &d->items[i];
7957
7958 return NULL;
7959 }
7960
7961 int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
7962 _cleanup_strv_free_ char **s = NULL;
7963 _cleanup_free_ char *p = NULL;
7964 ExecDirectoryItem *existing;
7965 int r;
7966
7967 assert(d);
7968 assert(path);
7969
7970 existing = exec_directory_find(d, path);
7971 if (existing) {
7972 r = strv_extend(&existing->symlinks, symlink);
7973 if (r < 0)
7974 return r;
7975
7976 return 0; /* existing item is updated */
7977 }
7978
7979 p = strdup(path);
7980 if (!p)
7981 return -ENOMEM;
7982
7983 if (symlink) {
7984 s = strv_new(symlink);
7985 if (!s)
7986 return -ENOMEM;
7987 }
7988
7989 if (!GREEDY_REALLOC(d->items, d->n_items + 1))
7990 return -ENOMEM;
7991
7992 d->items[d->n_items++] = (ExecDirectoryItem) {
7993 .path = TAKE_PTR(p),
7994 .symlinks = TAKE_PTR(s),
7995 };
7996
7997 return 1; /* new item is added */
7998 }
7999
8000 static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
8001 assert(a);
8002 assert(b);
8003
8004 return path_compare(a->path, b->path);
8005 }
8006
8007 void exec_directory_sort(ExecDirectory *d) {
8008 assert(d);
8009
8010 /* Sort the exec directories to make always parent directories processed at first in
8011 * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
8012 * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
8013 * list. See also comments in setup_exec_directory() and issue #24783. */
8014
8015 if (d->n_items <= 1)
8016 return;
8017
8018 typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
8019
8020 for (size_t i = 1; i < d->n_items; i++)
8021 for (size_t j = 0; j < i; j++)
8022 if (path_startswith(d->items[i].path, d->items[j].path)) {
8023 d->items[i].only_create = true;
8024 break;
8025 }
8026 }
8027
8028 ExecCleanMask exec_clean_mask_from_string(const char *s) {
8029 ExecDirectoryType t;
8030
8031 assert(s);
8032
8033 if (streq(s, "all"))
8034 return EXEC_CLEAN_ALL;
8035 if (streq(s, "fdstore"))
8036 return EXEC_CLEAN_FDSTORE;
8037
8038 t = exec_resource_type_from_string(s);
8039 if (t < 0)
8040 return (ExecCleanMask) t;
8041
8042 return 1U << t;
8043 }
8044
8045 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
8046 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
8047
8048 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
8049 [EXEC_INPUT_NULL] = "null",
8050 [EXEC_INPUT_TTY] = "tty",
8051 [EXEC_INPUT_TTY_FORCE] = "tty-force",
8052 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
8053 [EXEC_INPUT_SOCKET] = "socket",
8054 [EXEC_INPUT_NAMED_FD] = "fd",
8055 [EXEC_INPUT_DATA] = "data",
8056 [EXEC_INPUT_FILE] = "file",
8057 };
8058
8059 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
8060
8061 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
8062 [EXEC_OUTPUT_INHERIT] = "inherit",
8063 [EXEC_OUTPUT_NULL] = "null",
8064 [EXEC_OUTPUT_TTY] = "tty",
8065 [EXEC_OUTPUT_KMSG] = "kmsg",
8066 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
8067 [EXEC_OUTPUT_JOURNAL] = "journal",
8068 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
8069 [EXEC_OUTPUT_SOCKET] = "socket",
8070 [EXEC_OUTPUT_NAMED_FD] = "fd",
8071 [EXEC_OUTPUT_FILE] = "file",
8072 [EXEC_OUTPUT_FILE_APPEND] = "append",
8073 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
8074 };
8075
8076 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
8077
8078 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
8079 [EXEC_UTMP_INIT] = "init",
8080 [EXEC_UTMP_LOGIN] = "login",
8081 [EXEC_UTMP_USER] = "user",
8082 };
8083
8084 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
8085
8086 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
8087 [EXEC_PRESERVE_NO] = "no",
8088 [EXEC_PRESERVE_YES] = "yes",
8089 [EXEC_PRESERVE_RESTART] = "restart",
8090 };
8091
8092 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
8093
8094 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
8095 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8096 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
8097 [EXEC_DIRECTORY_STATE] = "StateDirectory",
8098 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
8099 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
8100 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
8101 };
8102
8103 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
8104
8105 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
8106 static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8107 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
8108 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
8109 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
8110 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
8111 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
8112 };
8113
8114 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
8115
8116 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
8117 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
8118 * directories, specifically .timer units with their timestamp touch file. */
8119 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8120 [EXEC_DIRECTORY_RUNTIME] = "runtime",
8121 [EXEC_DIRECTORY_STATE] = "state",
8122 [EXEC_DIRECTORY_CACHE] = "cache",
8123 [EXEC_DIRECTORY_LOGS] = "logs",
8124 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
8125 };
8126
8127 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
8128
8129 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
8130 * the service payload in. */
8131 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8132 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
8133 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
8134 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
8135 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
8136 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
8137 };
8138
8139 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
8140
8141 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
8142 [EXEC_KEYRING_INHERIT] = "inherit",
8143 [EXEC_KEYRING_PRIVATE] = "private",
8144 [EXEC_KEYRING_SHARED] = "shared",
8145 };
8146
8147 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);