]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
7cf6601ee7eb8564eddb8f2b2966b18eff1f5c16
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <poll.h>
6 #include <sys/eventfd.h>
7 #include <sys/file.h>
8 #include <sys/ioctl.h>
9 #include <sys/mman.h>
10 #include <sys/personality.h>
11 #include <sys/prctl.h>
12 #include <sys/shm.h>
13 #include <sys/types.h>
14 #include <sys/un.h>
15 #include <unistd.h>
16 #include <utmpx.h>
17
18 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
19
20 #if HAVE_PAM
21 #include <security/pam_appl.h>
22 #endif
23
24 #if HAVE_SELINUX
25 #include <selinux/selinux.h>
26 #endif
27
28 #if HAVE_APPARMOR
29 #include <sys/apparmor.h>
30 #endif
31
32 #include "sd-messages.h"
33
34 #include "af-list.h"
35 #include "alloc-util.h"
36 #if HAVE_APPARMOR
37 #include "apparmor-util.h"
38 #endif
39 #include "argv-util.h"
40 #include "async.h"
41 #include "barrier.h"
42 #include "bpf-lsm.h"
43 #include "btrfs-util.h"
44 #include "cap-list.h"
45 #include "capability-util.h"
46 #include "chattr-util.h"
47 #include "cgroup-setup.h"
48 #include "chase.h"
49 #include "chown-recursive.h"
50 #include "constants.h"
51 #include "cpu-set-util.h"
52 #include "data-fd-util.h"
53 #include "env-file.h"
54 #include "env-util.h"
55 #include "errno-list.h"
56 #include "escape.h"
57 #include "exec-credential.h"
58 #include "execute.h"
59 #include "exit-status.h"
60 #include "fd-util.h"
61 #include "format-util.h"
62 #include "glob-util.h"
63 #include "hexdecoct.h"
64 #include "io-util.h"
65 #include "ioprio-util.h"
66 #include "lock-util.h"
67 #include "log.h"
68 #include "macro.h"
69 #include "manager.h"
70 #include "manager-dump.h"
71 #include "memory-util.h"
72 #include "missing_fs.h"
73 #include "missing_ioprio.h"
74 #include "missing_prctl.h"
75 #include "mkdir-label.h"
76 #include "namespace.h"
77 #include "parse-util.h"
78 #include "path-util.h"
79 #include "proc-cmdline.h"
80 #include "process-util.h"
81 #include "psi-util.h"
82 #include "rlimit-util.h"
83 #include "rm-rf.h"
84 #include "seccomp-util.h"
85 #include "securebits-util.h"
86 #include "selinux-util.h"
87 #include "signal-util.h"
88 #include "smack-util.h"
89 #include "socket-util.h"
90 #include "sort-util.h"
91 #include "special.h"
92 #include "stat-util.h"
93 #include "string-table.h"
94 #include "string-util.h"
95 #include "strv.h"
96 #include "syslog-util.h"
97 #include "terminal-util.h"
98 #include "tmpfile-util.h"
99 #include "umask-util.h"
100 #include "unit-serialize.h"
101 #include "user-util.h"
102 #include "utmp-wtmp.h"
103
/* Two timeouts expressed in microseconds via USEC_PER_SEC (5 seconds and 1 second).
 * NOTE(review): neither is referenced in the part of the file visible here — presumably they bound
 * the idle wait done before starting a service further down; confirm against the users. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Send buffer size (8 MiB) requested through fd_inc_sndbuf() for the journal stream socket. */
#define SNDBUF_SIZE (8*1024*1024)
108
/* Rearranges the passed file descriptors so that entry i of the array ends up as fd number i+3.
 *
 * The array is modified in place: every entry that is not already at its target number is duplicated
 * with F_DUPFD (lowest free fd >= target), the old fd is closed and the slot is updated. If a target
 * number was still occupied, the pass is repeated starting from the first slot that missed its
 * target, until everything is in place.
 *
 * Returns 0 on success, or a negative errno-style error if fcntl() fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int target = idx + 3, dup_fd;

                        /* Nothing to do if this one already sits where it belongs */
                        if (fds[idx] == target)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, target);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* The target number was taken, so we got a higher one. Remember the first
                         * slot where that happened, and have another go from there later. */
                        if (retry_at < 0 && dup_fd != target)
                                retry_at = idx;
                }

                first = retry_at;
        } while (first >= 0);

        return 0;
}
148
/* Adjusts the file descriptor flags of the fds that are about to be passed to a child.
 *
 * The first n_socket_fds entries of the array get O_NONBLOCK set or cleared according to 'nonblock'
 * (this only applies to socket activation fds). FD_CLOEXEC is dropped from all n_fds entries, since
 * the whole point is to pass them on.
 *
 * Returns 0 on success, or the first negative error reported by fd_nonblock()/fd_cloexec(). */
static int flags_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        if (n_fds == 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < n_fds; idx++) {
                int fd = fds[idx], r;

                /* Only the leading socket fds have their O_NONBLOCK state touched */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fd, nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is turned off unconditionally, these fds are meant to survive exec() */
                r = fd_cloexec(fd, false);
                if (r < 0)
                        return r;
        }

        return 0;
}
184
185 static const char *exec_context_tty_path(const ExecContext *context) {
186 assert(context);
187
188 if (context->stdio_as_fds)
189 return NULL;
190
191 if (context->tty_path)
192 return context->tty_path;
193
194 return "/dev/console";
195 }
196
197 static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
198 unsigned rows, cols;
199 const char *tty;
200
201 assert(context);
202 assert(ret_rows);
203 assert(ret_cols);
204
205 rows = context->tty_rows;
206 cols = context->tty_cols;
207
208 tty = exec_context_tty_path(context);
209 if (tty)
210 (void) proc_cmdline_tty_size(tty, rows == UINT_MAX ? &rows : NULL, cols == UINT_MAX ? &cols : NULL);
211
212 *ret_rows = rows;
213 *ret_cols = cols;
214
215 return 0;
216 }
217
218 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
219 _cleanup_close_ int fd = -EBADF;
220 const char *path = exec_context_tty_path(ASSERT_PTR(context));
221
222 /* Take a lock around the device for the duration of the setup that we do here.
223 * systemd-vconsole-setup.service also takes the lock to avoid being interrupted.
224 * We open a new fd that will be closed automatically, and operate on it for convenience.
225 */
226
227 if (p && p->stdin_fd >= 0) {
228 fd = xopenat_lock(p->stdin_fd, NULL,
229 O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, 0, 0, LOCK_BSD, LOCK_EX);
230 if (fd < 0)
231 return;
232 } else if (path) {
233 fd = open_terminal(path, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
234 if (fd < 0)
235 return;
236
237 if (lock_generic(fd, LOCK_BSD, LOCK_EX) < 0)
238 return;
239 } else
240 return; /* nothing to do */
241
242 if (context->tty_vhangup)
243 (void) terminal_vhangup_fd(fd);
244
245 if (context->tty_reset)
246 (void) reset_terminal_fd(fd, true);
247
248 if (p && p->stdin_fd >= 0) {
249 unsigned rows = context->tty_rows, cols = context->tty_cols;
250
251 (void) exec_context_tty_size(context, &rows, &cols);
252 (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
253 }
254
255 if (context->tty_vt_disallocate && path)
256 (void) vt_disallocate(path);
257 }
258
259 static bool is_terminal_input(ExecInput i) {
260 return IN_SET(i,
261 EXEC_INPUT_TTY,
262 EXEC_INPUT_TTY_FORCE,
263 EXEC_INPUT_TTY_FAIL);
264 }
265
266 static bool is_terminal_output(ExecOutput o) {
267 return IN_SET(o,
268 EXEC_OUTPUT_TTY,
269 EXEC_OUTPUT_KMSG_AND_CONSOLE,
270 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
271 }
272
273 static bool is_kmsg_output(ExecOutput o) {
274 return IN_SET(o,
275 EXEC_OUTPUT_KMSG,
276 EXEC_OUTPUT_KMSG_AND_CONSOLE);
277 }
278
279 static bool exec_context_needs_term(const ExecContext *c) {
280 assert(c);
281
282 /* Return true if the execution context suggests we should set $TERM to something useful. */
283
284 if (is_terminal_input(c->std_input))
285 return true;
286
287 if (is_terminal_output(c->std_output))
288 return true;
289
290 if (is_terminal_output(c->std_error))
291 return true;
292
293 return !!c->tty_path;
294 }
295
296 static int open_null_as(int flags, int nfd) {
297 int fd;
298
299 assert(nfd >= 0);
300
301 fd = open("/dev/null", flags|O_NOCTTY);
302 if (fd < 0)
303 return -errno;
304
305 return move_fd(fd, nfd, false);
306 }
307
308 static int connect_journal_socket(
309 int fd,
310 const char *log_namespace,
311 uid_t uid,
312 gid_t gid) {
313
314 uid_t olduid = UID_INVALID;
315 gid_t oldgid = GID_INVALID;
316 const char *j;
317 int r;
318
319 j = log_namespace ?
320 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
321 "/run/systemd/journal/stdout";
322
323 if (gid_is_valid(gid)) {
324 oldgid = getgid();
325
326 if (setegid(gid) < 0)
327 return -errno;
328 }
329
330 if (uid_is_valid(uid)) {
331 olduid = getuid();
332
333 if (seteuid(uid) < 0) {
334 r = -errno;
335 goto restore_gid;
336 }
337 }
338
339 r = connect_unix_path(fd, AT_FDCWD, j);
340
341 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
342 an LSM interferes. */
343
344 if (uid_is_valid(uid))
345 (void) seteuid(olduid);
346
347 restore_gid:
348 if (gid_is_valid(gid))
349 (void) setegid(oldgid);
350
351 return r;
352 }
353
354 static int connect_logger_as(
355 const Unit *unit,
356 const ExecContext *context,
357 const ExecParameters *params,
358 ExecOutput output,
359 const char *ident,
360 int nfd,
361 uid_t uid,
362 gid_t gid) {
363
364 _cleanup_close_ int fd = -EBADF;
365 int r;
366
367 assert(context);
368 assert(params);
369 assert(output < _EXEC_OUTPUT_MAX);
370 assert(ident);
371 assert(nfd >= 0);
372
373 fd = socket(AF_UNIX, SOCK_STREAM, 0);
374 if (fd < 0)
375 return -errno;
376
377 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
378 if (r < 0)
379 return r;
380
381 if (shutdown(fd, SHUT_RD) < 0)
382 return -errno;
383
384 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
385
386 if (dprintf(fd,
387 "%s\n"
388 "%s\n"
389 "%i\n"
390 "%i\n"
391 "%i\n"
392 "%i\n"
393 "%i\n",
394 context->syslog_identifier ?: ident,
395 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
396 context->syslog_priority,
397 !!context->syslog_level_prefix,
398 false,
399 is_kmsg_output(output),
400 is_terminal_output(output)) < 0)
401 return -errno;
402
403 return move_fd(TAKE_FD(fd), nfd, false);
404 }
405
406 static int open_terminal_as(const char *path, int flags, int nfd) {
407 int fd;
408
409 assert(path);
410 assert(nfd >= 0);
411
412 fd = open_terminal(path, flags | O_NOCTTY);
413 if (fd < 0)
414 return fd;
415
416 return move_fd(fd, nfd, false);
417 }
418
419 static int acquire_path(const char *path, int flags, mode_t mode) {
420 _cleanup_close_ int fd = -EBADF;
421 int r;
422
423 assert(path);
424
425 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
426 flags |= O_CREAT;
427
428 fd = open(path, flags|O_NOCTTY, mode);
429 if (fd >= 0)
430 return TAKE_FD(fd);
431
432 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
433 return -errno;
434
435 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
436
437 fd = socket(AF_UNIX, SOCK_STREAM, 0);
438 if (fd < 0)
439 return -errno;
440
441 r = connect_unix_path(fd, AT_FDCWD, path);
442 if (IN_SET(r, -ENOTSOCK, -EINVAL))
443 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
444 * wasn't an AF_UNIX socket after all */
445 return -ENXIO;
446 if (r < 0)
447 return r;
448
449 if ((flags & O_ACCMODE) == O_RDONLY)
450 r = shutdown(fd, SHUT_WR);
451 else if ((flags & O_ACCMODE) == O_WRONLY)
452 r = shutdown(fd, SHUT_RD);
453 else
454 r = 0;
455 if (r < 0)
456 return -errno;
457
458 return TAKE_FD(fd);
459 }
460
461 static int fixup_input(
462 const ExecContext *context,
463 int socket_fd,
464 bool apply_tty_stdin) {
465
466 ExecInput std_input;
467
468 assert(context);
469
470 std_input = context->std_input;
471
472 if (is_terminal_input(std_input) && !apply_tty_stdin)
473 return EXEC_INPUT_NULL;
474
475 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
476 return EXEC_INPUT_NULL;
477
478 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
479 return EXEC_INPUT_NULL;
480
481 return std_input;
482 }
483
484 static int fixup_output(ExecOutput output, int socket_fd) {
485
486 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
487 return EXEC_OUTPUT_INHERIT;
488
489 return output;
490 }
491
492 static int setup_input(
493 const ExecContext *context,
494 const ExecParameters *params,
495 int socket_fd,
496 const int named_iofds[static 3]) {
497
498 ExecInput i;
499 int r;
500
501 assert(context);
502 assert(params);
503 assert(named_iofds);
504
505 if (params->stdin_fd >= 0) {
506 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
507 return -errno;
508
509 /* Try to make this the controlling tty, if it is a tty, and reset it */
510 if (isatty(STDIN_FILENO)) {
511 unsigned rows = context->tty_rows, cols = context->tty_cols;
512
513 (void) exec_context_tty_size(context, &rows, &cols);
514 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
515 (void) reset_terminal_fd(STDIN_FILENO, true);
516 (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
517 }
518
519 return STDIN_FILENO;
520 }
521
522 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
523
524 switch (i) {
525
526 case EXEC_INPUT_NULL:
527 return open_null_as(O_RDONLY, STDIN_FILENO);
528
529 case EXEC_INPUT_TTY:
530 case EXEC_INPUT_TTY_FORCE:
531 case EXEC_INPUT_TTY_FAIL: {
532 unsigned rows, cols;
533 int fd;
534
535 fd = acquire_terminal(exec_context_tty_path(context),
536 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
537 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
538 ACQUIRE_TERMINAL_WAIT,
539 USEC_INFINITY);
540 if (fd < 0)
541 return fd;
542
543 r = exec_context_tty_size(context, &rows, &cols);
544 if (r < 0)
545 return r;
546
547 r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
548 if (r < 0)
549 return r;
550
551 return move_fd(fd, STDIN_FILENO, false);
552 }
553
554 case EXEC_INPUT_SOCKET:
555 assert(socket_fd >= 0);
556
557 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
558
559 case EXEC_INPUT_NAMED_FD:
560 assert(named_iofds[STDIN_FILENO] >= 0);
561
562 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
563 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
564
565 case EXEC_INPUT_DATA: {
566 int fd;
567
568 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
569 if (fd < 0)
570 return fd;
571
572 return move_fd(fd, STDIN_FILENO, false);
573 }
574
575 case EXEC_INPUT_FILE: {
576 bool rw;
577 int fd;
578
579 assert(context->stdio_file[STDIN_FILENO]);
580
581 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
582 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
583
584 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
585 if (fd < 0)
586 return fd;
587
588 return move_fd(fd, STDIN_FILENO, false);
589 }
590
591 default:
592 assert_not_reached();
593 }
594 }
595
596 static bool can_inherit_stderr_from_stdout(
597 const ExecContext *context,
598 ExecOutput o,
599 ExecOutput e) {
600
601 assert(context);
602
603 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
604 * stderr fd */
605
606 if (e == EXEC_OUTPUT_INHERIT)
607 return true;
608 if (e != o)
609 return false;
610
611 if (e == EXEC_OUTPUT_NAMED_FD)
612 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
613
614 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
615 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
616
617 return true;
618 }
619
620 static int setup_output(
621 const Unit *unit,
622 const ExecContext *context,
623 const ExecParameters *params,
624 int fileno,
625 int socket_fd,
626 const int named_iofds[static 3],
627 const char *ident,
628 uid_t uid,
629 gid_t gid,
630 dev_t *journal_stream_dev,
631 ino_t *journal_stream_ino) {
632
633 ExecOutput o;
634 ExecInput i;
635 int r;
636
637 assert(unit);
638 assert(context);
639 assert(params);
640 assert(ident);
641 assert(journal_stream_dev);
642 assert(journal_stream_ino);
643
644 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
645
646 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
647 return -errno;
648
649 return STDOUT_FILENO;
650 }
651
652 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
653 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
654 return -errno;
655
656 return STDERR_FILENO;
657 }
658
659 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
660 o = fixup_output(context->std_output, socket_fd);
661
662 if (fileno == STDERR_FILENO) {
663 ExecOutput e;
664 e = fixup_output(context->std_error, socket_fd);
665
666 /* This expects the input and output are already set up */
667
668 /* Don't change the stderr file descriptor if we inherit all
669 * the way and are not on a tty */
670 if (e == EXEC_OUTPUT_INHERIT &&
671 o == EXEC_OUTPUT_INHERIT &&
672 i == EXEC_INPUT_NULL &&
673 !is_terminal_input(context->std_input) &&
674 getppid() != 1)
675 return fileno;
676
677 /* Duplicate from stdout if possible */
678 if (can_inherit_stderr_from_stdout(context, o, e))
679 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
680
681 o = e;
682
683 } else if (o == EXEC_OUTPUT_INHERIT) {
684 /* If input got downgraded, inherit the original value */
685 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
686 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
687
688 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
689 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
690 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
691
692 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
693 if (getppid() != 1)
694 return fileno;
695
696 /* We need to open /dev/null here anew, to get the right access mode. */
697 return open_null_as(O_WRONLY, fileno);
698 }
699
700 switch (o) {
701
702 case EXEC_OUTPUT_NULL:
703 return open_null_as(O_WRONLY, fileno);
704
705 case EXEC_OUTPUT_TTY:
706 if (is_terminal_input(i))
707 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
708
709 /* We don't reset the terminal if this is just about output */
710 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
711
712 case EXEC_OUTPUT_KMSG:
713 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
714 case EXEC_OUTPUT_JOURNAL:
715 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
716 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
717 if (r < 0) {
718 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
719 fileno == STDOUT_FILENO ? "stdout" : "stderr");
720 r = open_null_as(O_WRONLY, fileno);
721 } else {
722 struct stat st;
723
724 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
725 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
726 * services to detect whether they are connected to the journal or not.
727 *
728 * If both stdout and stderr are connected to a stream then let's make sure to store the data
729 * about STDERR as that's usually the best way to do logging. */
730
731 if (fstat(fileno, &st) >= 0 &&
732 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
733 *journal_stream_dev = st.st_dev;
734 *journal_stream_ino = st.st_ino;
735 }
736 }
737 return r;
738
739 case EXEC_OUTPUT_SOCKET:
740 assert(socket_fd >= 0);
741
742 return RET_NERRNO(dup2(socket_fd, fileno));
743
744 case EXEC_OUTPUT_NAMED_FD:
745 assert(named_iofds[fileno] >= 0);
746
747 (void) fd_nonblock(named_iofds[fileno], false);
748 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
749
750 case EXEC_OUTPUT_FILE:
751 case EXEC_OUTPUT_FILE_APPEND:
752 case EXEC_OUTPUT_FILE_TRUNCATE: {
753 bool rw;
754 int fd, flags;
755
756 assert(context->stdio_file[fileno]);
757
758 rw = context->std_input == EXEC_INPUT_FILE &&
759 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
760
761 if (rw)
762 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
763
764 flags = O_WRONLY;
765 if (o == EXEC_OUTPUT_FILE_APPEND)
766 flags |= O_APPEND;
767 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
768 flags |= O_TRUNC;
769
770 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
771 if (fd < 0)
772 return fd;
773
774 return move_fd(fd, fileno, 0);
775 }
776
777 default:
778 assert_not_reached();
779 }
780 }
781
782 static int chown_terminal(int fd, uid_t uid) {
783 int r;
784
785 assert(fd >= 0);
786
787 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
788 if (isatty(fd) < 1) {
789 if (IN_SET(errno, EINVAL, ENOTTY))
790 return 0; /* not a tty */
791
792 return -errno;
793 }
794
795 /* This might fail. What matters are the results. */
796 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
797 if (r < 0)
798 return r;
799
800 return 1;
801 }
802
803 static int setup_confirm_stdio(
804 const ExecContext *context,
805 const char *vc,
806 int *ret_saved_stdin,
807 int *ret_saved_stdout) {
808
809 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
810 unsigned rows, cols;
811 int r;
812
813 assert(ret_saved_stdin);
814 assert(ret_saved_stdout);
815
816 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
817 if (saved_stdin < 0)
818 return -errno;
819
820 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
821 if (saved_stdout < 0)
822 return -errno;
823
824 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
825 if (fd < 0)
826 return fd;
827
828 r = chown_terminal(fd, getuid());
829 if (r < 0)
830 return r;
831
832 r = reset_terminal_fd(fd, true);
833 if (r < 0)
834 return r;
835
836 r = exec_context_tty_size(context, &rows, &cols);
837 if (r < 0)
838 return r;
839
840 r = terminal_set_size_fd(fd, vc, rows, cols);
841 if (r < 0)
842 return r;
843
844 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
845 TAKE_FD(fd);
846 if (r < 0)
847 return r;
848
849 *ret_saved_stdin = TAKE_FD(saved_stdin);
850 *ret_saved_stdout = TAKE_FD(saved_stdout);
851 return 0;
852 }
853
854 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
855 assert(err < 0);
856
857 if (err == -ETIMEDOUT)
858 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
859 else {
860 errno = -err;
861 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
862 }
863 }
864
865 static void write_confirm_error(int err, const char *vc, const Unit *u) {
866 _cleanup_close_ int fd = -EBADF;
867
868 assert(vc);
869
870 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
871 if (fd < 0)
872 return;
873
874 write_confirm_error_fd(err, fd, u);
875 }
876
/* Releases the terminal and puts the saved stdin/stdout fds back in place (for each one that is
 * valid). Both saved fds are closed and invalidated afterwards in any case.
 *
 * Returns 0 on success. If restoring fails a negative errno-style error is returned; if both fail,
 * the one for stdout is reported. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
898
/* Possible outcomes of asking for confirmation before a command is spawned */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command, and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command, and pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* go ahead and execute the command */
};
904
905 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
906 int saved_stdout = -1, saved_stdin = -1, r;
907 _cleanup_free_ char *e = NULL;
908 char c;
909
910 /* For any internal errors, assume a positive response. */
911 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
912 if (r < 0) {
913 write_confirm_error(r, vc, u);
914 return CONFIRM_EXECUTE;
915 }
916
917 /* confirm_spawn might have been disabled while we were sleeping. */
918 if (manager_is_confirm_spawn_disabled(u->manager)) {
919 r = 1;
920 goto restore_stdio;
921 }
922
923 e = ellipsize(cmdline, 60, 100);
924 if (!e) {
925 log_oom();
926 r = CONFIRM_EXECUTE;
927 goto restore_stdio;
928 }
929
930 for (;;) {
931 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
932 if (r < 0) {
933 write_confirm_error_fd(r, STDOUT_FILENO, u);
934 r = CONFIRM_EXECUTE;
935 goto restore_stdio;
936 }
937
938 switch (c) {
939 case 'c':
940 printf("Resuming normal execution.\n");
941 manager_disable_confirm_spawn();
942 r = 1;
943 break;
944 case 'D':
945 unit_dump(u, stdout, " ");
946 continue; /* ask again */
947 case 'f':
948 printf("Failing execution.\n");
949 r = CONFIRM_PRETEND_FAILURE;
950 break;
951 case 'h':
952 printf(" c - continue, proceed without asking anymore\n"
953 " D - dump, show the state of the unit\n"
954 " f - fail, don't execute the command and pretend it failed\n"
955 " h - help\n"
956 " i - info, show a short summary of the unit\n"
957 " j - jobs, show jobs that are in progress\n"
958 " s - skip, don't execute the command and pretend it succeeded\n"
959 " y - yes, execute the command\n");
960 continue; /* ask again */
961 case 'i':
962 printf(" Description: %s\n"
963 " Unit: %s\n"
964 " Command: %s\n",
965 u->id, u->description, cmdline);
966 continue; /* ask again */
967 case 'j':
968 manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, " ");
969 continue; /* ask again */
970 case 'n':
971 /* 'n' was removed in favor of 'f'. */
972 printf("Didn't understand 'n', did you mean 'f'?\n");
973 continue; /* ask again */
974 case 's':
975 printf("Skipping execution.\n");
976 r = CONFIRM_PRETEND_SUCCESS;
977 break;
978 case 'y':
979 r = CONFIRM_EXECUTE;
980 break;
981 default:
982 assert_not_reached();
983 }
984 break;
985 }
986
987 restore_stdio:
988 restore_confirm_stdio(&saved_stdin, &saved_stdout);
989 return r;
990 }
991
992 static int get_fixed_user(
993 const char *username,
994 const char **ret_user,
995 uid_t *ret_uid,
996 gid_t *ret_gid,
997 const char **ret_home,
998 const char **ret_shell) {
999
1000 int r;
1001
1002 assert(username);
1003 assert(ret_user);
1004
1005 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1006 * (i.e. are "/" or "/bin/nologin"). */
1007
1008 r = get_user_creds(&username, ret_uid, ret_gid, ret_home, ret_shell, USER_CREDS_CLEAN);
1009 if (r < 0)
1010 return r;
1011
1012 *ret_user = username;
1013 return 0;
1014 }
1015
/* Resolves 'groupname' via get_group_creds() and returns the group name reported back by the lookup
 * in *ret_group, along with the GID in *ret_gid.
 *
 * Returns 0 on success, or the negative error of the lookup, in which case *ret_group is left
 * untouched. */
static int get_fixed_group(
                const char *groupname,
                const char **ret_group,
                gid_t *ret_gid) {

        const char *name;
        int r;

        assert(groupname);
        assert(ret_group);

        name = groupname;

        r = get_group_creds(&name, ret_gid, /* flags = */ 0);
        if (r < 0)
                return r;

        *ret_group = name;
        return 0;
}
1033
1034 static int get_supplementary_groups(const ExecContext *c, const char *user,
1035 const char *group, gid_t gid,
1036 gid_t **supplementary_gids, int *ngids) {
1037 int r, k = 0;
1038 int ngroups_max;
1039 bool keep_groups = false;
1040 gid_t *groups = NULL;
1041 _cleanup_free_ gid_t *l_gids = NULL;
1042
1043 assert(c);
1044
1045 /*
1046 * If user is given, then lookup GID and supplementary groups list.
1047 * We avoid NSS lookups for gid=0. Also we have to initialize groups
1048 * here and as early as possible so we keep the list of supplementary
1049 * groups of the caller.
1050 */
1051 if (user && gid_is_valid(gid) && gid != 0) {
1052 /* First step, initialize groups from /etc/groups */
1053 if (initgroups(user, gid) < 0)
1054 return -errno;
1055
1056 keep_groups = true;
1057 }
1058
1059 if (strv_isempty(c->supplementary_groups))
1060 return 0;
1061
1062 /*
1063 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1064 * be positive, otherwise fail.
1065 */
1066 errno = 0;
1067 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1068 if (ngroups_max <= 0)
1069 return errno_or_else(EOPNOTSUPP);
1070
1071 l_gids = new(gid_t, ngroups_max);
1072 if (!l_gids)
1073 return -ENOMEM;
1074
1075 if (keep_groups) {
1076 /*
1077 * Lookup the list of groups that the user belongs to, we
1078 * avoid NSS lookups here too for gid=0.
1079 */
1080 k = ngroups_max;
1081 if (getgrouplist(user, gid, l_gids, &k) < 0)
1082 return -EINVAL;
1083 } else
1084 k = 0;
1085
1086 STRV_FOREACH(i, c->supplementary_groups) {
1087 const char *g;
1088
1089 if (k >= ngroups_max)
1090 return -E2BIG;
1091
1092 g = *i;
1093 r = get_group_creds(&g, l_gids+k, 0);
1094 if (r < 0)
1095 return r;
1096
1097 k++;
1098 }
1099
1100 /*
1101 * Sets ngids to zero to drop all supplementary groups, happens
1102 * when we are under root and SupplementaryGroups= is empty.
1103 */
1104 if (k == 0) {
1105 *ngids = 0;
1106 return 0;
1107 }
1108
1109 /* Otherwise get the final list of supplementary groups */
1110 groups = memdup(l_gids, sizeof(gid_t) * k);
1111 if (!groups)
1112 return -ENOMEM;
1113
1114 *supplementary_gids = groups;
1115 *ngids = k;
1116
1117 groups = NULL;
1118
1119 return 0;
1120 }
1121
/* Applies the group credentials: installs the supplementary group list (if it is non-empty) and then
 * sets real, effective and saved GID to 'gid' (if it is valid).
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1140
/* Updates the securebits of the calling thread: all bits listed in 'mask' are cleared, then the bits
 * in 'bits' are set.
 *
 * Returns 0 if the resulting value equals the current one (nothing is written then), 1 if the
 * securebits were changed, and a negative errno-style error if prctl() fails. */
static int set_securebits(unsigned bits, unsigned mask) {
        unsigned wanted;
        int cur;

        cur = prctl(PR_GET_SECUREBITS);
        if (cur < 0)
                return -errno;

        wanted = (((unsigned) cur) & ~mask) | bits;

        /* Avoid the write if it wouldn't change anything */
        if (wanted == (unsigned) cur)
                return 0;

        return prctl(PR_SET_SECUREBITS, wanted) < 0 ? -errno : 1;
}
1159
1160 static int enforce_user(
1161 const ExecContext *context,
1162 uid_t uid,
1163 uint64_t capability_ambient_set) {
1164 assert(context);
1165 int r;
1166
1167 if (!uid_is_valid(uid))
1168 return 0;
1169
1170 /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1171 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1172 * case. */
1173
1174 if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1175
1176 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1177 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1178 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1179 if (r < 0)
1180 return r;
1181 }
1182
1183 /* Second step: actually set the uids */
1184 if (setresuid(uid, uid, uid) < 0)
1185 return -errno;
1186
1187 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1188 * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1189 * outside of this call. */
1190 return 0;
1191 }
1192
1193 #if HAVE_PAM
1194
1195 static int null_conv(
1196 int num_msg,
1197 const struct pam_message **msg,
1198 struct pam_response **resp,
1199 void *appdata_ptr) {
1200
1201 /* We don't support conversations */
1202
1203 return PAM_CONV_ERR;
1204 }
1205
1206 #endif
1207
/* Opens a PAM session for service 'name' and user 'user' and forks off a helper process
 * "(sd-pam)" which waits for SIGTERM and then releases the PAM credentials and, if the calling
 * process is gone by then, closes the session.
 *
 *   name, user  PAM service name and user name, handed to pam_start().
 *   uid, gid    Identity the helper process drops to before it starts waiting.
 *   tty         TTY path to register as PAM_TTY; if NULL, the TTY behind stdin (if any) is used.
 *   env         In: variables exported into the PAM environment. Out (on success): replaced by the
 *               list returned by pam_getenvlist().
 *   fds, n_fds  File descriptors that are closed in the helper process.
 *
 * Returns the result of strv_free_and_replace() on success, a negative errno-style value on failure
 * (-EPERM for any PAM-level error), and 0 if built without PAM support. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately not fatal. */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0) {
                /* No child was created, hence undo the SIGTERM blocking from above so that we do not
                 * return to the caller with a modified signal mask. */
                assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
                goto fail;
        }
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents to kill their privileged
                 * children this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* Note that sigwait() reports failure by returning a positive error number; it
                         * never returns a negative value and does not set errno. */
                        for (;;) {
                                r = sigwait(&ss, &sig);
                                if (r == 0)
                                        break;
                                if (r != EINTR)
                                        goto child_finish;
                        }

                        assert(sig == SIGTERM);
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1424
1425 static void rename_process_from_path(const char *path) {
1426 _cleanup_free_ char *buf = NULL;
1427 const char *p;
1428
1429 assert(path);
1430
1431 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1432 * /bin/ps */
1433
1434 if (path_extract_filename(path, &buf) < 0) {
1435 rename_process("(...)");
1436 return;
1437 }
1438
1439 size_t l = strlen(buf);
1440 if (l > 8) {
1441 /* The end of the process name is usually more interesting, since the first bit might just be
1442 * "systemd-" */
1443 p = buf + l - 8;
1444 l = 8;
1445 } else
1446 p = buf;
1447
1448 char process_name[11];
1449 process_name[0] = '(';
1450 memcpy(process_name+1, p, l);
1451 process_name[1+l] = ')';
1452 process_name[1+l+1] = 0;
1453
1454 rename_process(process_name);
1455 }
1456
1457 static bool context_has_address_families(const ExecContext *c) {
1458 assert(c);
1459
1460 return c->address_families_allow_list ||
1461 !set_isempty(c->address_families);
1462 }
1463
1464 static bool context_has_syscall_filters(const ExecContext *c) {
1465 assert(c);
1466
1467 return c->syscall_allow_list ||
1468 !hashmap_isempty(c->syscall_filter);
1469 }
1470
1471 static bool context_has_syscall_logs(const ExecContext *c) {
1472 assert(c);
1473
1474 return c->syscall_log_allow_list ||
1475 !hashmap_isempty(c->syscall_log);
1476 }
1477
1478 static bool context_has_no_new_privileges(const ExecContext *c) {
1479 assert(c);
1480
1481 if (c->no_new_privileges)
1482 return true;
1483
1484 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1485 return false;
1486
1487 /* We need NNP if we have any form of seccomp and are unprivileged */
1488 return c->lock_personality ||
1489 c->memory_deny_write_execute ||
1490 c->private_devices ||
1491 c->protect_clock ||
1492 c->protect_hostname ||
1493 c->protect_kernel_tunables ||
1494 c->protect_kernel_modules ||
1495 c->protect_kernel_logs ||
1496 context_has_address_families(c) ||
1497 exec_context_restrict_namespaces_set(c) ||
1498 c->restrict_realtime ||
1499 c->restrict_suid_sgid ||
1500 !set_isempty(c->syscall_archs) ||
1501 context_has_syscall_filters(c) ||
1502 context_has_syscall_logs(c);
1503 }
1504
1505 #if HAVE_SECCOMP
1506
1507 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1508
1509 if (is_seccomp_available())
1510 return false;
1511
1512 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1513 return true;
1514 }
1515
1516 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1517 uint32_t negative_action, default_action, action;
1518 int r;
1519
1520 assert(u);
1521 assert(c);
1522
1523 if (!context_has_syscall_filters(c))
1524 return 0;
1525
1526 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1527 return 0;
1528
1529 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1530
1531 if (c->syscall_allow_list) {
1532 default_action = negative_action;
1533 action = SCMP_ACT_ALLOW;
1534 } else {
1535 default_action = SCMP_ACT_ALLOW;
1536 action = negative_action;
1537 }
1538
1539 if (needs_ambient_hack) {
1540 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1541 if (r < 0)
1542 return r;
1543 }
1544
1545 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1546 }
1547
1548 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1549 #ifdef SCMP_ACT_LOG
1550 uint32_t default_action, action;
1551 #endif
1552
1553 assert(u);
1554 assert(c);
1555
1556 if (!context_has_syscall_logs(c))
1557 return 0;
1558
1559 #ifdef SCMP_ACT_LOG
1560 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1561 return 0;
1562
1563 if (c->syscall_log_allow_list) {
1564 /* Log nothing but the ones listed */
1565 default_action = SCMP_ACT_ALLOW;
1566 action = SCMP_ACT_LOG;
1567 } else {
1568 /* Log everything but the ones listed */
1569 default_action = SCMP_ACT_LOG;
1570 action = SCMP_ACT_ALLOW;
1571 }
1572
1573 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1574 #else
1575 /* old libseccomp */
1576 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1577 return 0;
1578 #endif
1579 }
1580
1581 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1582 assert(u);
1583 assert(c);
1584
1585 if (set_isempty(c->syscall_archs))
1586 return 0;
1587
1588 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1589 return 0;
1590
1591 return seccomp_restrict_archs(c->syscall_archs);
1592 }
1593
1594 static int apply_address_families(const Unit* u, const ExecContext *c) {
1595 assert(u);
1596 assert(c);
1597
1598 if (!context_has_address_families(c))
1599 return 0;
1600
1601 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1602 return 0;
1603
1604 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1605 }
1606
1607 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1608 int r;
1609
1610 assert(u);
1611 assert(c);
1612
1613 if (!c->memory_deny_write_execute)
1614 return 0;
1615
1616 /* use prctl() if kernel supports it (6.3) */
1617 r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1618 if (r == 0) {
1619 log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1620 return 0;
1621 }
1622 if (r < 0 && errno != EINVAL)
1623 return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1624 /* else use seccomp */
1625 log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1626
1627 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1628 return 0;
1629
1630 return seccomp_memory_deny_write_execute();
1631 }
1632
1633 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1634 assert(u);
1635 assert(c);
1636
1637 if (!c->restrict_realtime)
1638 return 0;
1639
1640 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1641 return 0;
1642
1643 return seccomp_restrict_realtime();
1644 }
1645
1646 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1647 assert(u);
1648 assert(c);
1649
1650 if (!c->restrict_suid_sgid)
1651 return 0;
1652
1653 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1654 return 0;
1655
1656 return seccomp_restrict_suid_sgid();
1657 }
1658
1659 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1660 assert(u);
1661 assert(c);
1662
1663 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1664 * let's protect even those systems where this is left on in the kernel. */
1665
1666 if (!c->protect_kernel_tunables)
1667 return 0;
1668
1669 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1670 return 0;
1671
1672 return seccomp_protect_sysctl();
1673 }
1674
1675 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1676 assert(u);
1677 assert(c);
1678
1679 /* Turn off module syscalls on ProtectKernelModules=yes */
1680
1681 if (!c->protect_kernel_modules)
1682 return 0;
1683
1684 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1685 return 0;
1686
1687 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1688 }
1689
1690 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1691 assert(u);
1692 assert(c);
1693
1694 if (!c->protect_kernel_logs)
1695 return 0;
1696
1697 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1698 return 0;
1699
1700 return seccomp_protect_syslog();
1701 }
1702
1703 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1704 assert(u);
1705 assert(c);
1706
1707 if (!c->protect_clock)
1708 return 0;
1709
1710 if (skip_seccomp_unavailable(u, "ProtectClock="))
1711 return 0;
1712
1713 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1714 }
1715
1716 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1717 assert(u);
1718 assert(c);
1719
1720 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1721
1722 if (!c->private_devices)
1723 return 0;
1724
1725 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1726 return 0;
1727
1728 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1729 }
1730
1731 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1732 assert(u);
1733 assert(c);
1734
1735 if (!exec_context_restrict_namespaces_set(c))
1736 return 0;
1737
1738 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1739 return 0;
1740
1741 return seccomp_restrict_namespaces(c->restrict_namespaces);
1742 }
1743
1744 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1745 unsigned long personality;
1746 int r;
1747
1748 assert(u);
1749 assert(c);
1750
1751 if (!c->lock_personality)
1752 return 0;
1753
1754 if (skip_seccomp_unavailable(u, "LockPersonality="))
1755 return 0;
1756
1757 personality = c->personality;
1758
1759 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1760 if (personality == PERSONALITY_INVALID) {
1761
1762 r = opinionated_personality(&personality);
1763 if (r < 0)
1764 return r;
1765 }
1766
1767 return seccomp_lock_personality(personality);
1768 }
1769
1770 #endif
1771
1772 #if HAVE_LIBBPF
1773 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1774 assert(u);
1775 assert(c);
1776
1777 if (!exec_context_restrict_filesystems_set(c))
1778 return 0;
1779
1780 if (!u->manager->restrict_fs) {
1781 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1782 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1783 return 0;
1784 }
1785
1786 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1787 }
1788 #endif
1789
1790 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1791 assert(u);
1792 assert(c);
1793
1794 if (!c->protect_hostname)
1795 return 0;
1796
1797 if (ns_type_supported(NAMESPACE_UTS)) {
1798 if (unshare(CLONE_NEWUTS) < 0) {
1799 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1800 *ret_exit_status = EXIT_NAMESPACE;
1801 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1802 }
1803
1804 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1805 }
1806 } else
1807 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1808
1809 #if HAVE_SECCOMP
1810 int r;
1811
1812 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1813 return 0;
1814
1815 r = seccomp_protect_hostname();
1816 if (r < 0) {
1817 *ret_exit_status = EXIT_SECCOMP;
1818 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1819 }
1820 #endif
1821
1822 return 0;
1823 }
1824
1825 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1826 assert(idle_pipe);
1827
1828 idle_pipe[1] = safe_close(idle_pipe[1]);
1829 idle_pipe[2] = safe_close(idle_pipe[2]);
1830
1831 if (idle_pipe[0] >= 0) {
1832 int r;
1833
1834 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1835
1836 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1837 ssize_t n;
1838
1839 /* Signal systemd that we are bored and want to continue. */
1840 n = write(idle_pipe[3], "x", 1);
1841 if (n > 0)
1842 /* Wait for systemd to react to the signal above. */
1843 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1844 }
1845
1846 idle_pipe[0] = safe_close(idle_pipe[0]);
1847
1848 }
1849
1850 idle_pipe[3] = safe_close(idle_pipe[3]);
1851 }
1852
1853 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1854
1855 static int build_environment(
1856 const Unit *u,
1857 const ExecContext *c,
1858 const ExecParameters *p,
1859 const CGroupContext *cgroup_context,
1860 size_t n_fds,
1861 char **fdnames,
1862 const char *home,
1863 const char *username,
1864 const char *shell,
1865 dev_t journal_stream_dev,
1866 ino_t journal_stream_ino,
1867 const char *memory_pressure_path,
1868 char ***ret) {
1869
1870 _cleanup_strv_free_ char **our_env = NULL;
1871 size_t n_env = 0;
1872 char *x;
1873 int r;
1874
1875 assert(u);
1876 assert(c);
1877 assert(p);
1878 assert(ret);
1879
1880 #define N_ENV_VARS 19
1881 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1882 if (!our_env)
1883 return -ENOMEM;
1884
1885 if (n_fds > 0) {
1886 _cleanup_free_ char *joined = NULL;
1887
1888 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1889 return -ENOMEM;
1890 our_env[n_env++] = x;
1891
1892 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1893 return -ENOMEM;
1894 our_env[n_env++] = x;
1895
1896 joined = strv_join(fdnames, ":");
1897 if (!joined)
1898 return -ENOMEM;
1899
1900 x = strjoin("LISTEN_FDNAMES=", joined);
1901 if (!x)
1902 return -ENOMEM;
1903 our_env[n_env++] = x;
1904 }
1905
1906 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1907 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1908 return -ENOMEM;
1909 our_env[n_env++] = x;
1910
1911 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1912 return -ENOMEM;
1913 our_env[n_env++] = x;
1914 }
1915
1916 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1917 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1918 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1919 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1920 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1921 if (!x)
1922 return -ENOMEM;
1923 our_env[n_env++] = x;
1924 }
1925
1926 /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
1927 * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
1928 * really make much sense since we're not logged in. Hence we conditionalize the three based on
1929 * SetLoginEnvironment= switch. */
1930 if (!c->user && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
1931 r = get_fixed_user("root", &username, NULL, NULL, &home, &shell);
1932 if (r < 0)
1933 return log_unit_error_errno(u, r, "Failed to determine user credentials for root: %m");
1934 }
1935
1936 bool set_user_login_env = c->set_login_environment >= 0 ? c->set_login_environment : (c->user || c->dynamic_user);
1937
1938 if (username) {
1939 x = strjoin("USER=", username);
1940 if (!x)
1941 return -ENOMEM;
1942 our_env[n_env++] = x;
1943
1944 if (set_user_login_env) {
1945 x = strjoin("LOGNAME=", username);
1946 if (!x)
1947 return -ENOMEM;
1948 our_env[n_env++] = x;
1949 }
1950 }
1951
1952 if (home && set_user_login_env) {
1953 x = strjoin("HOME=", home);
1954 if (!x)
1955 return -ENOMEM;
1956
1957 path_simplify(x + 5);
1958 our_env[n_env++] = x;
1959 }
1960
1961 if (shell && set_user_login_env) {
1962 x = strjoin("SHELL=", shell);
1963 if (!x)
1964 return -ENOMEM;
1965
1966 path_simplify(x + 6);
1967 our_env[n_env++] = x;
1968 }
1969
1970 if (!sd_id128_is_null(u->invocation_id)) {
1971 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1972 return -ENOMEM;
1973
1974 our_env[n_env++] = x;
1975 }
1976
1977 if (exec_context_needs_term(c)) {
1978 _cleanup_free_ char *cmdline = NULL;
1979 const char *tty_path, *term = NULL;
1980
1981 tty_path = exec_context_tty_path(c);
1982
1983 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1984 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1985 * container manager passes to PID 1 ends up all the way in the console login shown. */
1986
1987 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1988 term = getenv("TERM");
1989 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1990 _cleanup_free_ char *key = NULL;
1991
1992 key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
1993 if (!key)
1994 return -ENOMEM;
1995
1996 r = proc_cmdline_get_key(key, 0, &cmdline);
1997 if (r < 0)
1998 log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
1999 else if (r > 0)
2000 term = cmdline;
2001 }
2002
2003 if (!term)
2004 term = default_term_for_tty(tty_path);
2005
2006 x = strjoin("TERM=", term);
2007 if (!x)
2008 return -ENOMEM;
2009 our_env[n_env++] = x;
2010 }
2011
2012 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
2013 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
2014 return -ENOMEM;
2015
2016 our_env[n_env++] = x;
2017 }
2018
2019 if (c->log_namespace) {
2020 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
2021 if (!x)
2022 return -ENOMEM;
2023
2024 our_env[n_env++] = x;
2025 }
2026
2027 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2028 _cleanup_free_ char *joined = NULL;
2029 const char *n;
2030
2031 if (!p->prefix[t])
2032 continue;
2033
2034 if (c->directories[t].n_items == 0)
2035 continue;
2036
2037 n = exec_directory_env_name_to_string(t);
2038 if (!n)
2039 continue;
2040
2041 for (size_t i = 0; i < c->directories[t].n_items; i++) {
2042 _cleanup_free_ char *prefixed = NULL;
2043
2044 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2045 if (!prefixed)
2046 return -ENOMEM;
2047
2048 if (!strextend_with_separator(&joined, ":", prefixed))
2049 return -ENOMEM;
2050 }
2051
2052 x = strjoin(n, "=", joined);
2053 if (!x)
2054 return -ENOMEM;
2055
2056 our_env[n_env++] = x;
2057 }
2058
2059 _cleanup_free_ char *creds_dir = NULL;
2060 r = exec_context_get_credential_directory(c, p, u->id, &creds_dir);
2061 if (r < 0)
2062 return r;
2063 if (r > 0) {
2064 x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
2065 if (!x)
2066 return -ENOMEM;
2067
2068 our_env[n_env++] = x;
2069 }
2070
2071 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2072 return -ENOMEM;
2073
2074 our_env[n_env++] = x;
2075
2076 if (memory_pressure_path) {
2077 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2078 if (!x)
2079 return -ENOMEM;
2080
2081 our_env[n_env++] = x;
2082
2083 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2084 _cleanup_free_ char *b = NULL, *e = NULL;
2085
2086 if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2087 MEMORY_PRESSURE_DEFAULT_TYPE,
2088 cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2089 CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2090 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2091 return -ENOMEM;
2092
2093 if (base64mem(b, strlen(b) + 1, &e) < 0)
2094 return -ENOMEM;
2095
2096 x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2097 if (!x)
2098 return -ENOMEM;
2099
2100 our_env[n_env++] = x;
2101 }
2102 }
2103
2104 assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2105 #undef N_ENV_VARS
2106
2107 *ret = TAKE_PTR(our_env);
2108
2109 return 0;
2110 }
2111
2112 static int build_pass_environment(const ExecContext *c, char ***ret) {
2113 _cleanup_strv_free_ char **pass_env = NULL;
2114 size_t n_env = 0;
2115
2116 STRV_FOREACH(i, c->pass_environment) {
2117 _cleanup_free_ char *x = NULL;
2118 char *v;
2119
2120 v = getenv(*i);
2121 if (!v)
2122 continue;
2123 x = strjoin(*i, "=", v);
2124 if (!x)
2125 return -ENOMEM;
2126
2127 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2128 return -ENOMEM;
2129
2130 pass_env[n_env++] = TAKE_PTR(x);
2131 pass_env[n_env] = NULL;
2132 }
2133
2134 *ret = TAKE_PTR(pass_env);
2135
2136 return 0;
2137 }
2138
2139 bool exec_needs_network_namespace(const ExecContext *context) {
2140 assert(context);
2141
2142 return context->private_network || context->network_namespace_path;
2143 }
2144
2145 static bool exec_needs_ephemeral(const ExecContext *context) {
2146 return (context->root_image || context->root_directory) && context->root_ephemeral;
2147 }
2148
2149 static bool exec_needs_ipc_namespace(const ExecContext *context) {
2150 assert(context);
2151
2152 return context->private_ipc || context->ipc_namespace_path;
2153 }
2154
2155 bool exec_needs_mount_namespace(
2156 const ExecContext *context,
2157 const ExecParameters *params,
2158 const ExecRuntime *runtime) {
2159
2160 assert(context);
2161
2162 if (context->root_image)
2163 return true;
2164
2165 if (!strv_isempty(context->read_write_paths) ||
2166 !strv_isempty(context->read_only_paths) ||
2167 !strv_isempty(context->inaccessible_paths) ||
2168 !strv_isempty(context->exec_paths) ||
2169 !strv_isempty(context->no_exec_paths))
2170 return true;
2171
2172 if (context->n_bind_mounts > 0)
2173 return true;
2174
2175 if (context->n_temporary_filesystems > 0)
2176 return true;
2177
2178 if (context->n_mount_images > 0)
2179 return true;
2180
2181 if (context->n_extension_images > 0)
2182 return true;
2183
2184 if (!strv_isempty(context->extension_directories))
2185 return true;
2186
2187 if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
2188 return true;
2189
2190 if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
2191 return true;
2192
2193 if (context->private_devices ||
2194 context->private_mounts > 0 ||
2195 (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
2196 context->protect_system != PROTECT_SYSTEM_NO ||
2197 context->protect_home != PROTECT_HOME_NO ||
2198 context->protect_kernel_tunables ||
2199 context->protect_kernel_modules ||
2200 context->protect_kernel_logs ||
2201 context->protect_control_groups ||
2202 context->protect_proc != PROTECT_PROC_DEFAULT ||
2203 context->proc_subset != PROC_SUBSET_ALL ||
2204 exec_needs_ipc_namespace(context))
2205 return true;
2206
2207 if (context->root_directory) {
2208 if (exec_context_get_effective_mount_apivfs(context))
2209 return true;
2210
2211 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2212 if (params && !params->prefix[t])
2213 continue;
2214
2215 if (context->directories[t].n_items > 0)
2216 return true;
2217 }
2218 }
2219
2220 if (context->dynamic_user &&
2221 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2222 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2223 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2224 return true;
2225
2226 if (context->log_namespace)
2227 return true;
2228
2229 return false;
2230 }
2231
2232 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2233 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2234 _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
2235 _cleanup_close_ int unshare_ready_fd = -EBADF;
2236 _cleanup_(sigkill_waitp) pid_t pid = 0;
2237 uint64_t c = 1;
2238 ssize_t n;
2239 int r;
2240
2241 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2242 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2243 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2244 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2245 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2246 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2247 * continues execution normally.
2248 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2249 * does not need CAP_SETUID to write the single line mapping to itself. */
2250
2251 /* Can only set up multiple mappings with CAP_SETUID. */
2252 if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2253 r = asprintf(&uid_map,
2254 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2255 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2256 ouid, ouid, uid, uid);
2257 else
2258 r = asprintf(&uid_map,
2259 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2260 ouid, ouid);
2261
2262 if (r < 0)
2263 return -ENOMEM;
2264
2265 /* Can only set up multiple mappings with CAP_SETGID. */
2266 if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2267 r = asprintf(&gid_map,
2268 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2269 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2270 ogid, ogid, gid, gid);
2271 else
2272 r = asprintf(&gid_map,
2273 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2274 ogid, ogid);
2275
2276 if (r < 0)
2277 return -ENOMEM;
2278
2279 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2280 * namespace. */
2281 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2282 if (unshare_ready_fd < 0)
2283 return -errno;
2284
2285 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2286 * failed. */
2287 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2288 return -errno;
2289
2290 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2291 if (r < 0)
2292 return r;
2293 if (r == 0) {
2294 _cleanup_close_ int fd = -EBADF;
2295 const char *a;
2296 pid_t ppid;
2297
2298 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2299 * here, after the parent opened its own user namespace. */
2300
2301 ppid = getppid();
2302 errno_pipe[0] = safe_close(errno_pipe[0]);
2303
2304 /* Wait until the parent unshared the user namespace */
2305 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2306 r = -errno;
2307 goto child_fail;
2308 }
2309
2310 /* Disable the setgroups() system call in the child user namespace, for good. */
2311 a = procfs_file_alloca(ppid, "setgroups");
2312 fd = open(a, O_WRONLY|O_CLOEXEC);
2313 if (fd < 0) {
2314 if (errno != ENOENT) {
2315 r = -errno;
2316 goto child_fail;
2317 }
2318
2319 /* If the file is missing the kernel is too old, let's continue anyway. */
2320 } else {
2321 if (write(fd, "deny\n", 5) < 0) {
2322 r = -errno;
2323 goto child_fail;
2324 }
2325
2326 fd = safe_close(fd);
2327 }
2328
2329 /* First write the GID map */
2330 a = procfs_file_alloca(ppid, "gid_map");
2331 fd = open(a, O_WRONLY|O_CLOEXEC);
2332 if (fd < 0) {
2333 r = -errno;
2334 goto child_fail;
2335 }
2336 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2337 r = -errno;
2338 goto child_fail;
2339 }
2340 fd = safe_close(fd);
2341
2342 /* The write the UID map */
2343 a = procfs_file_alloca(ppid, "uid_map");
2344 fd = open(a, O_WRONLY|O_CLOEXEC);
2345 if (fd < 0) {
2346 r = -errno;
2347 goto child_fail;
2348 }
2349 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2350 r = -errno;
2351 goto child_fail;
2352 }
2353
2354 _exit(EXIT_SUCCESS);
2355
2356 child_fail:
2357 (void) write(errno_pipe[1], &r, sizeof(r));
2358 _exit(EXIT_FAILURE);
2359 }
2360
2361 errno_pipe[1] = safe_close(errno_pipe[1]);
2362
2363 if (unshare(CLONE_NEWUSER) < 0)
2364 return -errno;
2365
2366 /* Let the child know that the namespace is ready now */
2367 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2368 return -errno;
2369
2370 /* Try to read an error code from the child */
2371 n = read(errno_pipe[0], &r, sizeof(r));
2372 if (n < 0)
2373 return -errno;
2374 if (n == sizeof(r)) { /* an error code was sent to us */
2375 if (r < 0)
2376 return r;
2377 return -EIO;
2378 }
2379 if (n != 0) /* on success we should have read 0 bytes */
2380 return -EIO;
2381
2382 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2383 if (r < 0)
2384 return r;
2385 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2386 return -EIO;
2387
2388 return 0;
2389 }
2390
2391 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2392 assert(context);
2393
2394 if (!context->dynamic_user)
2395 return false;
2396
2397 if (type == EXEC_DIRECTORY_CONFIGURATION)
2398 return false;
2399
2400 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2401 return false;
2402
2403 return true;
2404 }
2405
2406 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2407 _cleanup_free_ char *src_abs = NULL;
2408 int r;
2409
2410 assert(source);
2411
2412 src_abs = path_join(root, source);
2413 if (!src_abs)
2414 return -ENOMEM;
2415
2416 STRV_FOREACH(dst, symlinks) {
2417 _cleanup_free_ char *dst_abs = NULL;
2418
2419 dst_abs = path_join(root, *dst);
2420 if (!dst_abs)
2421 return -ENOMEM;
2422
2423 r = mkdir_parents_label(dst_abs, 0755);
2424 if (r < 0)
2425 return r;
2426
2427 r = symlink_idempotent(src_abs, dst_abs, true);
2428 if (r < 0)
2429 return r;
2430 }
2431
2432 return 0;
2433 }
2434
/* Creates and prepares all directories of the specified type that are configured in the execution context,
 * below params->prefix[type]. Does nothing (and returns 0) if no prefix is set for the type.
 *
 * For each configured directory this:
 *   - creates the parent directories,
 *   - in user scope, possibly creates a compatibility symlink for state/log directories that so far only
 *     exist below the configuration prefix,
 *   - creates the directory itself, either below "private/" (see exec_directory_is_private()) with a
 *     symlink from the public place, or directly at the public place, migrating a pre-existing directory
 *     between the two layouts where needed,
 *   - applies the configured access mode and, in system scope only, recursively changes ownership to
 *     uid/gid.
 *
 * If EXEC_CHOWN_DIRECTORIES is set in params->flags, an invalid uid or gid is replaced by 0. If
 * needs_mount_namespace is false the per-directory symlinks are created here as well.
 *
 * Returns 0 on success. On failure returns a negative errno-style code and stores the exit status
 * matching the directory type in *exit_status. */
static int setup_exec_directory(
                Unit *u,
                const ExecContext *context,
                const ExecParameters *params,
                uid_t uid,
                gid_t gid,
                ExecDirectoryType type,
                bool needs_mount_namespace,
                int *exit_status) {

        /* Maps each directory type to the exit status reported when setting it up fails */
        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
                [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
                [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
                [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
                [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
        };
        int r;

        assert(context);
        assert(params);
        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
        assert(exit_status);

        if (!params->prefix[type])
                return 0;

        if (params->flags & EXEC_CHOWN_DIRECTORIES) {
                if (!uid_is_valid(uid))
                        uid = 0;
                if (!gid_is_valid(gid))
                        gid = 0;
        }

        for (size_t i = 0; i < context->directories[type].n_items; i++) {
                /* p is the public path of the directory, pp (if set) the path below "private/" */
                _cleanup_free_ char *p = NULL, *pp = NULL;

                p = path_join(params->prefix[type], context->directories[type].items[i].path);
                if (!p) {
                        r = -ENOMEM;
                        goto fail;
                }

                r = mkdir_parents_label(p, 0755);
                if (r < 0)
                        goto fail;

                if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {

                        /* If we are in user mode, and a configuration directory exists but a state directory
                         * doesn't exist, then we likely are upgrading from an older systemd version that
                         * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
                         * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
                         * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
                         * separated. If a service has both dirs configured but only the configuration dir
                         * exists and the state dir does not, we assume we are looking at an update
                         * situation. Hence, create a compatibility symlink, so that all expectations are
                         * met.
                         *
                         * (We also do something similar with the log directory, which still doesn't exist in
                         * the xdg basedir spec. We'll make it a subdir of the state dir.) */

                        /* This assumes the state and log dirs are always set up before the configuration
                         * dir; the compile-time checks below pin the enum order this relies on. */
                        assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
                        assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);

                        r = laccess(p, F_OK);
                        if (r == -ENOENT) {
                                _cleanup_free_ char *q = NULL;

                                /* OK, we know that the state dir does not exist. Let's see if the dir exists
                                 * under the configuration hierarchy. */

                                if (type == EXEC_DIRECTORY_STATE)
                                        q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
                                else if (type == EXEC_DIRECTORY_LOGS)
                                        q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
                                else
                                        assert_not_reached();
                                if (!q) {
                                        r = -ENOMEM;
                                        goto fail;
                                }

                                r = laccess(q, F_OK);
                                if (r >= 0) {
                                        /* It does exist! This hence looks like an update. Symlink the
                                         * configuration directory into the state directory. */

                                        r = symlink_idempotent(q, p, /* make_relative= */ true);
                                        if (r < 0)
                                                goto fail;

                                        log_unit_notice(u, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
                                        /* The symlink stands in for the directory, nothing else to set up for this item */
                                        continue;
                                } else if (r != -ENOENT)
                                        log_unit_warning_errno(u, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);

                        } else if (r < 0)
                                log_unit_warning_errno(u, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
                }

                if (exec_directory_is_private(context, type)) {
                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
                         * case we want to avoid leaving a directory around fully accessible that is owned by
                         * a dynamic user whose UID is later on reused. To lock this down we use the same
                         * trick used by container managers to prohibit host users to get access to files of
                         * the same UID in containers: we place everything inside a directory that has an
                         * access mode of 0700 and is owned root:root, so that it acts as security boundary
                         * for unprivileged host code. We then use fs namespacing to make this directory
                         * permeable for the service itself.
                         *
                         * Specifically: for a service which wants a special directory "foo/" we first create
                         * a directory "private/" with access mode 0700 owned by root:root. Then we place
                         * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
                         * "private/foo". This way, privileged host users can access "foo/" as usual, but
                         * unprivileged host users can't look into it. Inside of the namespace of the unit
                         * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
                         * "private/foo/" is mounted under the same name, thus disabling the access boundary
                         * for the service and making sure it only gets access to the dirs it needs but no
                         * others. Tricky? Yes, absolutely, but it works!
                         *
                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
                         * to be owned by the service itself.
                         *
                         * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
                         * for sharing files or sockets with other services. */

                        pp = path_join(params->prefix[type], "private");
                        if (!pp) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
                        r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
                        if (r < 0)
                                goto fail;

                        if (!path_extend(&pp, context->directories[type].items[i].path)) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* Create all directories between the configured directory and this private root, and mark them 0755 */
                        r = mkdir_parents_label(pp, 0755);
                        if (r < 0)
                                goto fail;

                        if (is_dir(p, false) > 0 &&
                            (laccess(pp, F_OK) == -ENOENT)) {

                                /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
                                 * it over. Most likely the service has been upgraded from one that didn't use
                                 * DynamicUser=1, to one that does. */

                                log_unit_info(u, "Found pre-existing public %s= directory %s, migrating to %s.\n"
                                              "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
                                              exec_directory_type_to_string(type), p, pp);

                                r = RET_NERRNO(rename(p, pp));
                                if (r < 0)
                                        goto fail;
                        } else {
                                /* Otherwise, create the actual directory for the service */

                                r = mkdir_label(pp, context->directories[type].mode);
                                if (r < 0 && r != -EEXIST)
                                        goto fail;
                        }

                        if (!context->directories[type].items[i].only_create) {
                                /* And link it up from the original place.
                                 * Notes
                                 * 1) If a mount namespace is going to be used, then this symlink remains on
                                 *    the host, and a new one for the child namespace will be created later.
                                 * 2) It is not necessary to create this symlink when one of its parent
                                 *    directories is specified and already created. E.g.
                                 *        StateDirectory=foo foo/bar
                                 *    In that case, the inode points to pp and p for "foo/bar" are the same:
                                 *        pp = "/var/lib/private/foo/bar"
                                 *        p = "/var/lib/foo/bar"
                                 *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
                                 *    we do not need to create the symlink, but we cannot create the symlink.
                                 *    See issue #24783. */
                                r = symlink_idempotent(pp, p, true);
                                if (r < 0)
                                        goto fail;
                        }

                } else {
                        _cleanup_free_ char *target = NULL;

                        if (type != EXEC_DIRECTORY_CONFIGURATION &&
                            readlink_and_make_absolute(p, &target) >= 0) {
                                _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;

                                /* This already exists and is a symlink? Interesting. Maybe it's one created
                                 * by DynamicUser=1 (see above)?
                                 *
                                 * We do this for all directory types except for ConfigurationDirectory=,
                                 * since they all support the private/ symlink logic at least in some
                                 * configurations, see above. */

                                r = chase(target, NULL, 0, &target_resolved, NULL);
                                if (r < 0)
                                        goto fail;

                                q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
                                if (!q) {
                                        r = -ENOMEM;
                                        goto fail;
                                }

                                /* /var/lib or friends may be symlinks. So, let's chase them also. */
                                r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
                                if (r < 0)
                                        goto fail;

                                if (path_equal(q_resolved, target_resolved)) {

                                        /* Hmm, apparently DynamicUser= was once turned on for this service,
                                         * but is no longer. Let's move the directory back up. */

                                        log_unit_info(u, "Found pre-existing private %s= directory %s, migrating to %s.\n"
                                                      "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
                                                      exec_directory_type_to_string(type), q, p);

                                        /* Remove the symlink first, so that the directory can take its place */
                                        r = RET_NERRNO(unlink(p));
                                        if (r < 0)
                                                goto fail;

                                        r = RET_NERRNO(rename(q, p));
                                        if (r < 0)
                                                goto fail;
                                }
                        }

                        r = mkdir_label(p, context->directories[type].mode);
                        if (r < 0) {
                                if (r != -EEXIST)
                                        goto fail;

                                if (type == EXEC_DIRECTORY_CONFIGURATION) {
                                        struct stat st;

                                        /* Don't change the owner/access mode of the configuration directory,
                                         * as in the common case it is not written to by a service, and shall
                                         * not be writable. */

                                        r = RET_NERRNO(stat(p, &st));
                                        if (r < 0)
                                                goto fail;

                                        /* Still complain if the access mode doesn't match */
                                        if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
                                                log_unit_warning(u, "%s \'%s\' already exists but the mode is different. "
                                                                 "(File system: %o %sMode: %o)",
                                                                 exec_directory_type_to_string(type), context->directories[type].items[i].path,
                                                                 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);

                                        continue;
                                }
                        }
                }

                /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
                 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
                 * current UID/GID ownership.) */
                r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
                if (r < 0)
                        goto fail;

                /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
                 * available to user code anyway */
                if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
                        continue;

                /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
                 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
                 * assignments to exist. */
                r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
                if (r < 0)
                        goto fail;
        }

        /* If we are not going to run in a namespace, set up the symlinks - otherwise
         * they are set up later, to allow configuring empty var/run/etc. */
        if (!needs_mount_namespace)
                for (size_t i = 0; i < context->directories[type].n_items; i++) {
                        r = create_many_symlinks(params->prefix[type],
                                                 context->directories[type].items[i].path,
                                                 context->directories[type].items[i].symlinks);
                        if (r < 0)
                                goto fail;
                }

        return 0;

fail:
        /* Tell the caller which directory type failed, via the matching exit status */
        *exit_status = exit_status_table[type];
        return r;
}
2738
#if ENABLE_SMACK
/* Applies a SMACK process label to the calling process (PID 0). The label configured in the execution
 * context takes precedence. If none is configured there but the manager has a default label, the
 * SMACK_ATTR_EXEC label read from the executable's fd is used if it could be read, and the manager's
 * default otherwise. If neither is configured nothing is done.
 *
 * Returns 0 on success (or if there was nothing to do), negative errno-style code on failure. */
static int setup_smack(
                const Manager *manager,
                const ExecContext *context,
                int executable_fd) {

        _cleanup_free_ char *exec_label = NULL;
        const char *label;
        int r;

        assert(context);
        assert(executable_fd >= 0);

        label = context->smack_process_label;
        if (!label) {
                if (!manager->defaults.smack_process_label)
                        return 0;

                /* A missing xattr on the executable is fine, we then use the manager's default label */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                        return r;

                label = exec_label ?: manager->defaults.smack_process_label;
        }

        r = mac_smack_apply_pid(0, label);
        if (r < 0)
                return r;

        return 0;
}
#endif
2768
2769 static int compile_bind_mounts(
2770 const ExecContext *context,
2771 const ExecParameters *params,
2772 BindMount **ret_bind_mounts,
2773 size_t *ret_n_bind_mounts,
2774 char ***ret_empty_directories) {
2775
2776 _cleanup_strv_free_ char **empty_directories = NULL;
2777 BindMount *bind_mounts = NULL;
2778 size_t n, h = 0;
2779 int r;
2780
2781 assert(context);
2782 assert(params);
2783 assert(ret_bind_mounts);
2784 assert(ret_n_bind_mounts);
2785 assert(ret_empty_directories);
2786
2787 CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2788
2789 n = context->n_bind_mounts;
2790 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2791 if (!params->prefix[t])
2792 continue;
2793
2794 for (size_t i = 0; i < context->directories[t].n_items; i++)
2795 n += !context->directories[t].items[i].only_create;
2796 }
2797
2798 if (n <= 0) {
2799 *ret_bind_mounts = NULL;
2800 *ret_n_bind_mounts = 0;
2801 *ret_empty_directories = NULL;
2802 return 0;
2803 }
2804
2805 bind_mounts = new(BindMount, n);
2806 if (!bind_mounts)
2807 return -ENOMEM;
2808
2809 for (size_t i = 0; i < context->n_bind_mounts; i++) {
2810 BindMount *item = context->bind_mounts + i;
2811 _cleanup_free_ char *s = NULL, *d = NULL;
2812
2813 s = strdup(item->source);
2814 if (!s)
2815 return -ENOMEM;
2816
2817 d = strdup(item->destination);
2818 if (!d)
2819 return -ENOMEM;
2820
2821 bind_mounts[h++] = (BindMount) {
2822 .source = TAKE_PTR(s),
2823 .destination = TAKE_PTR(d),
2824 .read_only = item->read_only,
2825 .recursive = item->recursive,
2826 .ignore_enoent = item->ignore_enoent,
2827 };
2828 }
2829
2830 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2831 if (!params->prefix[t])
2832 continue;
2833
2834 if (context->directories[t].n_items == 0)
2835 continue;
2836
2837 if (exec_directory_is_private(context, t) &&
2838 !exec_context_with_rootfs(context)) {
2839 char *private_root;
2840
2841 /* So this is for a dynamic user, and we need to make sure the process can access its own
2842 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2843 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2844
2845 private_root = path_join(params->prefix[t], "private");
2846 if (!private_root)
2847 return -ENOMEM;
2848
2849 r = strv_consume(&empty_directories, private_root);
2850 if (r < 0)
2851 return r;
2852 }
2853
2854 for (size_t i = 0; i < context->directories[t].n_items; i++) {
2855 _cleanup_free_ char *s = NULL, *d = NULL;
2856
2857 /* When one of the parent directories is in the list, we cannot create the symlink
2858 * for the child directory. See also the comments in setup_exec_directory(). */
2859 if (context->directories[t].items[i].only_create)
2860 continue;
2861
2862 if (exec_directory_is_private(context, t))
2863 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
2864 else
2865 s = path_join(params->prefix[t], context->directories[t].items[i].path);
2866 if (!s)
2867 return -ENOMEM;
2868
2869 if (exec_directory_is_private(context, t) &&
2870 exec_context_with_rootfs(context))
2871 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2872 * directory is not created on the root directory. So, let's bind-mount the directory
2873 * on the 'non-private' place. */
2874 d = path_join(params->prefix[t], context->directories[t].items[i].path);
2875 else
2876 d = strdup(s);
2877 if (!d)
2878 return -ENOMEM;
2879
2880 bind_mounts[h++] = (BindMount) {
2881 .source = TAKE_PTR(s),
2882 .destination = TAKE_PTR(d),
2883 .read_only = false,
2884 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2885 .recursive = true,
2886 .ignore_enoent = false,
2887 };
2888 }
2889 }
2890
2891 assert(h == n);
2892
2893 *ret_bind_mounts = TAKE_PTR(bind_mounts);
2894 *ret_n_bind_mounts = n;
2895 *ret_empty_directories = TAKE_PTR(empty_directories);
2896
2897 return (int) n;
2898 }
2899
2900 /* ret_symlinks will contain a list of pairs src:dest that describes
2901 * the symlinks to create later on. For example, the symlinks needed
2902 * to safely give private directories to DynamicUser=1 users. */
2903 static int compile_symlinks(
2904 const ExecContext *context,
2905 const ExecParameters *params,
2906 bool setup_os_release_symlink,
2907 char ***ret_symlinks) {
2908
2909 _cleanup_strv_free_ char **symlinks = NULL;
2910 int r;
2911
2912 assert(context);
2913 assert(params);
2914 assert(ret_symlinks);
2915
2916 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2917 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
2918 _cleanup_free_ char *private_path = NULL, *path = NULL;
2919
2920 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
2921 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
2922
2923 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2924 dst_abs = path_join(params->prefix[dt], *symlink);
2925 if (!src_abs || !dst_abs)
2926 return -ENOMEM;
2927
2928 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2929 if (r < 0)
2930 return r;
2931 }
2932
2933 if (!exec_directory_is_private(context, dt) ||
2934 exec_context_with_rootfs(context) ||
2935 context->directories[dt].items[i].only_create)
2936 continue;
2937
2938 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
2939 if (!private_path)
2940 return -ENOMEM;
2941
2942 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2943 if (!path)
2944 return -ENOMEM;
2945
2946 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2947 if (r < 0)
2948 return r;
2949 }
2950 }
2951
2952 /* We make the host's os-release available via a symlink, so that we can copy it atomically
2953 * and readers will never get a half-written version. Note that, while the paths specified here are
2954 * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2955 * 'os-release -> .os-release-stage/os-release' is what will be created. */
2956 if (setup_os_release_symlink) {
2957 r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release");
2958 if (r < 0)
2959 return r;
2960
2961 r = strv_extend(&symlinks, "/run/host/os-release");
2962 if (r < 0)
2963 return r;
2964 }
2965
2966 *ret_symlinks = TAKE_PTR(symlinks);
2967
2968 return 0;
2969 }
2970
2971 static bool insist_on_sandboxing(
2972 const ExecContext *context,
2973 const char *root_dir,
2974 const char *root_image,
2975 const BindMount *bind_mounts,
2976 size_t n_bind_mounts) {
2977
2978 assert(context);
2979 assert(n_bind_mounts == 0 || bind_mounts);
2980
2981 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2982 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2983 * rearrange stuff in a way we cannot ignore gracefully. */
2984
2985 if (context->n_temporary_filesystems > 0)
2986 return true;
2987
2988 if (root_dir || root_image)
2989 return true;
2990
2991 if (context->n_mount_images > 0)
2992 return true;
2993
2994 if (context->dynamic_user)
2995 return true;
2996
2997 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
2998 return true;
2999
3000 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3001 * essential. */
3002 for (size_t i = 0; i < n_bind_mounts; i++)
3003 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3004 return true;
3005
3006 if (context->log_namespace)
3007 return true;
3008
3009 return false;
3010 }
3011
3012 static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
3013 _cleanup_close_ int fd = -EBADF;
3014 int r;
3015
3016 if (!runtime || !runtime->ephemeral_copy)
3017 return 0;
3018
3019 r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
3020 if (r < 0)
3021 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
3022
3023 CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
3024
3025 fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3026 if (fd >= 0)
3027 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3028 return 0;
3029
3030 if (fd != -EAGAIN)
3031 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3032
3033 log_debug("Making ephemeral snapshot of %s to %s",
3034 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3035
3036 if (context->root_image)
3037 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
3038 COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
3039 else
3040 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
3041 AT_FDCWD, runtime->ephemeral_copy,
3042 BTRFS_SNAPSHOT_FALLBACK_COPY |
3043 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3044 BTRFS_SNAPSHOT_RECURSIVE |
3045 BTRFS_SNAPSHOT_LOCK_BSD);
3046 if (fd < 0)
3047 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
3048 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3049
3050 if (context->root_image) {
3051 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3052 * which tends to not perform well in combination with lots of random writes.
3053 *
3054 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3055 * copy, but we at least want to make the intention clear.
3056 */
3057 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
3058 if (r < 0)
3059 log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
3060 }
3061
3062 r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
3063 if (r < 0)
3064 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
3065
3066 return 1;
3067 }
3068
3069 static int verity_settings_prepare(
3070 VeritySettings *verity,
3071 const char *root_image,
3072 const void *root_hash,
3073 size_t root_hash_size,
3074 const char *root_hash_path,
3075 const void *root_hash_sig,
3076 size_t root_hash_sig_size,
3077 const char *root_hash_sig_path,
3078 const char *verity_data_path) {
3079
3080 int r;
3081
3082 assert(verity);
3083
3084 if (root_hash) {
3085 void *d;
3086
3087 d = memdup(root_hash, root_hash_size);
3088 if (!d)
3089 return -ENOMEM;
3090
3091 free_and_replace(verity->root_hash, d);
3092 verity->root_hash_size = root_hash_size;
3093 verity->designator = PARTITION_ROOT;
3094 }
3095
3096 if (root_hash_sig) {
3097 void *d;
3098
3099 d = memdup(root_hash_sig, root_hash_sig_size);
3100 if (!d)
3101 return -ENOMEM;
3102
3103 free_and_replace(verity->root_hash_sig, d);
3104 verity->root_hash_sig_size = root_hash_sig_size;
3105 verity->designator = PARTITION_ROOT;
3106 }
3107
3108 if (verity_data_path) {
3109 r = free_and_strdup(&verity->data_path, verity_data_path);
3110 if (r < 0)
3111 return r;
3112 }
3113
3114 r = verity_settings_load(
3115 verity,
3116 root_image,
3117 root_hash_path,
3118 root_hash_sig_path);
3119 if (r < 0)
3120 return log_debug_errno(r, "Failed to load root hash: %m");
3121
3122 return 0;
3123 }
3124
3125 static int apply_mount_namespace(
3126 const Unit *u,
3127 ExecCommandFlags command_flags,
3128 const ExecContext *context,
3129 const ExecParameters *params,
3130 ExecRuntime *runtime,
3131 const char *memory_pressure_path,
3132 char **error_path) {
3133
3134 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3135 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3136 **read_write_paths_cleanup = NULL;
3137 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3138 *extension_dir = NULL, *host_os_release_stage = NULL;
3139 const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
3140 char **read_write_paths;
3141 bool needs_sandboxing, setup_os_release_symlink;
3142 BindMount *bind_mounts = NULL;
3143 size_t n_bind_mounts = 0;
3144 int r;
3145
3146 assert(context);
3147
3148 CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3149
3150 if (params->flags & EXEC_APPLY_CHROOT) {
3151 r = setup_ephemeral(context, runtime);
3152 if (r < 0)
3153 return r;
3154
3155 if (context->root_image)
3156 root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
3157 else
3158 root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
3159 }
3160
3161 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3162 if (r < 0)
3163 return r;
3164
3165 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3166 * service will need to write to it in order to start the notifications. */
3167 if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3168 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3169 if (!read_write_paths_cleanup)
3170 return -ENOMEM;
3171
3172 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3173 if (r < 0)
3174 return r;
3175
3176 read_write_paths = read_write_paths_cleanup;
3177 } else
3178 read_write_paths = context->read_write_paths;
3179
3180 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3181 if (needs_sandboxing) {
3182 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3183 * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3184 * use here. This does not apply when we are using /run/systemd/empty as fallback. */
3185
3186 if (context->private_tmp && runtime && runtime->shared) {
3187 if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3188 tmp_dir = runtime->shared->tmp_dir;
3189 else if (runtime->shared->tmp_dir)
3190 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
3191
3192 if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3193 var_tmp_dir = runtime->shared->var_tmp_dir;
3194 else if (runtime->shared->var_tmp_dir)
3195 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
3196 }
3197 }
3198
3199 /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3200 setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
3201 r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3202 if (r < 0)
3203 return r;
3204
3205 if (context->mount_propagation_flag == MS_SHARED)
3206 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3207
3208 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3209 r = exec_context_get_credential_directory(context, params, u->id, &creds_path);
3210 if (r < 0)
3211 return r;
3212 }
3213
3214 if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
3215 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3216 if (!propagate_dir)
3217 return -ENOMEM;
3218
3219 incoming_dir = strdup("/run/systemd/incoming");
3220 if (!incoming_dir)
3221 return -ENOMEM;
3222
3223 extension_dir = strdup("/run/systemd/unit-extensions");
3224 if (!extension_dir)
3225 return -ENOMEM;
3226
3227 /* If running under a different root filesystem, propagate the host's os-release. We make a
3228 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3229 if (setup_os_release_symlink) {
3230 host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3231 if (!host_os_release_stage)
3232 return -ENOMEM;
3233 }
3234 } else {
3235 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3236
3237 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3238 return -ENOMEM;
3239
3240 if (setup_os_release_symlink) {
3241 if (asprintf(&host_os_release_stage,
3242 "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3243 geteuid()) < 0)
3244 return -ENOMEM;
3245 }
3246 }
3247
3248 if (root_image) {
3249 r = verity_settings_prepare(
3250 &verity,
3251 root_image,
3252 context->root_hash, context->root_hash_size, context->root_hash_path,
3253 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3254 context->root_verity);
3255 if (r < 0)
3256 return r;
3257 }
3258
3259 NamespaceParameters parameters = {
3260 .runtime_scope = params->runtime_scope,
3261
3262 .root_directory = root_dir,
3263 .root_image = root_image,
3264 .root_image_options = context->root_image_options,
3265 .root_image_policy = context->root_image_policy ?: &image_policy_service,
3266
3267 .read_write_paths = read_write_paths,
3268 .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
3269 .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
3270
3271 .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
3272 .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
3273
3274 .empty_directories = empty_directories,
3275 .symlinks = symlinks,
3276
3277 .bind_mounts = bind_mounts,
3278 .n_bind_mounts = n_bind_mounts,
3279
3280 .temporary_filesystems = context->temporary_filesystems,
3281 .n_temporary_filesystems = context->n_temporary_filesystems,
3282
3283 .mount_images = context->mount_images,
3284 .n_mount_images = context->n_mount_images,
3285 .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
3286
3287 .tmp_dir = tmp_dir,
3288 .var_tmp_dir = var_tmp_dir,
3289
3290 .creds_path = creds_path,
3291 .log_namespace = context->log_namespace,
3292 .mount_propagation_flag = context->mount_propagation_flag,
3293
3294 .verity = &verity,
3295
3296 .extension_images = context->extension_images,
3297 .n_extension_images = context->n_extension_images,
3298 .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
3299 .extension_directories = context->extension_directories,
3300
3301 .propagate_dir = propagate_dir,
3302 .incoming_dir = incoming_dir,
3303 .extension_dir = extension_dir,
3304 .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
3305 .host_os_release_stage = host_os_release_stage,
3306
3307 /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
3308 * otherwise enforce it, don't ignore protected paths and fail if we are enable to apply the
3309 * sandbox inside the mount namespace. */
3310 .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
3311
3312 .protect_control_groups = needs_sandboxing && context->protect_control_groups,
3313 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
3314 .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
3315 .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
3316 .protect_hostname = needs_sandboxing && context->protect_hostname,
3317
3318 .private_dev = needs_sandboxing && context->private_devices,
3319 .private_network = needs_sandboxing && exec_needs_network_namespace(context),
3320 .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
3321
3322 .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
3323
3324 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3325 .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
3326
3327 .protect_home = needs_sandboxing && context->protect_home,
3328 .protect_system = needs_sandboxing && context->protect_system,
3329 .protect_proc = needs_sandboxing && context->protect_proc,
3330 .proc_subset = needs_sandboxing && context->proc_subset,
3331 };
3332
3333 r = setup_namespace(&parameters, error_path);
3334 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3335 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3336 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3337 * completely different execution environment. */
3338 if (r == -ENOANO) {
3339 if (insist_on_sandboxing(
3340 context,
3341 root_dir, root_image,
3342 bind_mounts,
3343 n_bind_mounts))
3344 return log_unit_debug_errno(u,
3345 SYNTHETIC_ERRNO(EOPNOTSUPP),
3346 "Failed to set up namespace, and refusing to continue since "
3347 "the selected namespacing options alter mount environment non-trivially.\n"
3348 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3349 n_bind_mounts,
3350 context->n_temporary_filesystems,
3351 yes_no(root_dir),
3352 yes_no(root_image),
3353 yes_no(context->dynamic_user));
3354
3355 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3356 return 0;
3357 }
3358
3359 return r;
3360 }
3361
3362 static int apply_working_directory(
3363 const ExecContext *context,
3364 const ExecParameters *params,
3365 ExecRuntime *runtime,
3366 const char *home,
3367 int *exit_status) {
3368
3369 const char *d, *wd;
3370
3371 assert(context);
3372 assert(exit_status);
3373
3374 if (context->working_directory_home) {
3375
3376 if (!home) {
3377 *exit_status = EXIT_CHDIR;
3378 return -ENXIO;
3379 }
3380
3381 wd = home;
3382
3383 } else
3384 wd = empty_to_root(context->working_directory);
3385
3386 if (params->flags & EXEC_APPLY_CHROOT)
3387 d = wd;
3388 else
3389 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
3390
3391 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3392 *exit_status = EXIT_CHDIR;
3393 return -errno;
3394 }
3395
3396 return 0;
3397 }
3398
3399 static int apply_root_directory(
3400 const ExecContext *context,
3401 const ExecParameters *params,
3402 ExecRuntime *runtime,
3403 const bool needs_mount_ns,
3404 int *exit_status) {
3405
3406 assert(context);
3407 assert(exit_status);
3408
3409 if (params->flags & EXEC_APPLY_CHROOT)
3410 if (!needs_mount_ns && context->root_directory)
3411 if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
3412 *exit_status = EXIT_CHROOT;
3413 return -errno;
3414 }
3415
3416 return 0;
3417 }
3418
3419 static int setup_keyring(
3420 const Unit *u,
3421 const ExecContext *context,
3422 const ExecParameters *p,
3423 uid_t uid, gid_t gid) {
3424
3425 key_serial_t keyring;
3426 int r = 0;
3427 uid_t saved_uid;
3428 gid_t saved_gid;
3429
3430 assert(u);
3431 assert(context);
3432 assert(p);
3433
3434 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3435 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3436 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3437 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3438 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3439 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3440
3441 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3442 return 0;
3443
3444 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3445 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3446 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3447 * & group is just as nasty as acquiring a reference to the user keyring. */
3448
3449 saved_uid = getuid();
3450 saved_gid = getgid();
3451
3452 if (gid_is_valid(gid) && gid != saved_gid) {
3453 if (setregid(gid, -1) < 0)
3454 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3455 }
3456
3457 if (uid_is_valid(uid) && uid != saved_uid) {
3458 if (setreuid(uid, -1) < 0) {
3459 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3460 goto out;
3461 }
3462 }
3463
3464 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3465 if (keyring == -1) {
3466 if (errno == ENOSYS)
3467 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3468 else if (ERRNO_IS_PRIVILEGE(errno))
3469 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3470 else if (errno == EDQUOT)
3471 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3472 else
3473 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3474
3475 goto out;
3476 }
3477
3478 /* When requested link the user keyring into the session keyring. */
3479 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3480
3481 if (keyctl(KEYCTL_LINK,
3482 KEY_SPEC_USER_KEYRING,
3483 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3484 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3485 goto out;
3486 }
3487 }
3488
3489 /* Restore uid/gid back */
3490 if (uid_is_valid(uid) && uid != saved_uid) {
3491 if (setreuid(saved_uid, -1) < 0) {
3492 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3493 goto out;
3494 }
3495 }
3496
3497 if (gid_is_valid(gid) && gid != saved_gid) {
3498 if (setregid(saved_gid, -1) < 0)
3499 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3500 }
3501
3502 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3503 if (!sd_id128_is_null(u->invocation_id)) {
3504 key_serial_t key;
3505
3506 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3507 if (key == -1)
3508 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3509 else {
3510 if (keyctl(KEYCTL_SETPERM, key,
3511 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3512 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3513 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3514 }
3515 }
3516
3517 out:
3518 /* Revert back uid & gid for the last time, and exit */
3519 /* no extra logging, as only the first already reported error matters */
3520 if (getuid() != saved_uid)
3521 (void) setreuid(saved_uid, -1);
3522
3523 if (getgid() != saved_gid)
3524 (void) setregid(saved_gid, -1);
3525
3526 return r;
3527 }
3528
/* Appends each non-negative fd of the two-element 'pair' to 'array', advancing the element
 * counter '*n' once per appended fd. Negative entries are skipped. The caller is responsible for
 * 'array' having enough room. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                (*n)++;
        }
}
3539
3540 static int close_remaining_fds(
3541 const ExecParameters *params,
3542 const ExecRuntime *runtime,
3543 int user_lookup_fd,
3544 int socket_fd,
3545 const int *fds, size_t n_fds) {
3546
3547 size_t n_dont_close = 0;
3548 int dont_close[n_fds + 14];
3549
3550 assert(params);
3551
3552 if (params->stdin_fd >= 0)
3553 dont_close[n_dont_close++] = params->stdin_fd;
3554 if (params->stdout_fd >= 0)
3555 dont_close[n_dont_close++] = params->stdout_fd;
3556 if (params->stderr_fd >= 0)
3557 dont_close[n_dont_close++] = params->stderr_fd;
3558
3559 if (socket_fd >= 0)
3560 dont_close[n_dont_close++] = socket_fd;
3561 if (n_fds > 0) {
3562 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3563 n_dont_close += n_fds;
3564 }
3565
3566 if (runtime)
3567 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3568
3569 if (runtime && runtime->shared) {
3570 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3571 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
3572 }
3573
3574 if (runtime && runtime->dynamic_creds) {
3575 if (runtime->dynamic_creds->user)
3576 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3577 if (runtime->dynamic_creds->group)
3578 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
3579 }
3580
3581 if (user_lookup_fd >= 0)
3582 dont_close[n_dont_close++] = user_lookup_fd;
3583
3584 return close_all_fds(dont_close, n_dont_close);
3585 }
3586
3587 static int send_user_lookup(
3588 Unit *unit,
3589 int user_lookup_fd,
3590 uid_t uid,
3591 gid_t gid) {
3592
3593 assert(unit);
3594
3595 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3596 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3597 * specified. */
3598
3599 if (user_lookup_fd < 0)
3600 return 0;
3601
3602 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3603 return 0;
3604
3605 if (writev(user_lookup_fd,
3606 (struct iovec[]) {
3607 IOVEC_MAKE(&uid, sizeof(uid)),
3608 IOVEC_MAKE(&gid, sizeof(gid)),
3609 IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
3610 return -errno;
3611
3612 return 0;
3613 }
3614
3615 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3616 int r;
3617
3618 assert(c);
3619 assert(home);
3620 assert(buf);
3621
3622 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3623
3624 if (*home)
3625 return 0;
3626
3627 if (!c->working_directory_home)
3628 return 0;
3629
3630 r = get_home_dir(buf);
3631 if (r < 0)
3632 return r;
3633
3634 *home = *buf;
3635 return 1;
3636 }
3637
3638 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3639 _cleanup_strv_free_ char ** list = NULL;
3640 int r;
3641
3642 assert(c);
3643 assert(p);
3644 assert(ret);
3645
3646 assert(c->dynamic_user);
3647
3648 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3649 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3650 * directories. */
3651
3652 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3653 if (t == EXEC_DIRECTORY_CONFIGURATION)
3654 continue;
3655
3656 if (!p->prefix[t])
3657 continue;
3658
3659 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3660 char *e;
3661
3662 if (exec_directory_is_private(c, t))
3663 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3664 else
3665 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3666 if (!e)
3667 return -ENOMEM;
3668
3669 r = strv_consume(&list, e);
3670 if (r < 0)
3671 return r;
3672 }
3673 }
3674
3675 *ret = TAKE_PTR(list);
3676
3677 return 0;
3678 }
3679
3680 static int exec_parameters_get_cgroup_path(
3681 const ExecParameters *params,
3682 const CGroupContext *c,
3683 char **ret) {
3684
3685 const char *subgroup = NULL;
3686 char *p;
3687
3688 assert(params);
3689 assert(ret);
3690
3691 if (!params->cgroup_path)
3692 return -EINVAL;
3693
3694 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3695 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3696 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3697 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3698 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3699 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3700 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3701 * flag, which is only passed for the former statements, not for the latter. */
3702
3703 if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
3704 if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
3705 subgroup = ".control";
3706 else
3707 subgroup = c->delegate_subgroup;
3708 }
3709
3710 if (subgroup)
3711 p = path_join(params->cgroup_path, subgroup);
3712 else
3713 p = strdup(params->cgroup_path);
3714 if (!p)
3715 return -ENOMEM;
3716
3717 *ret = p;
3718 return !!subgroup;
3719 }
3720
3721 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3722 _cleanup_(cpu_set_reset) CPUSet s = {};
3723 int r;
3724
3725 assert(c);
3726 assert(ret);
3727
3728 if (!c->numa_policy.nodes.set) {
3729 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3730 return 0;
3731 }
3732
3733 r = numa_to_cpu_set(&c->numa_policy, &s);
3734 if (r < 0)
3735 return r;
3736
3737 cpu_set_reset(ret);
3738
3739 return cpu_set_add_all(ret, &s);
3740 }
3741
3742 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3743 assert(c);
3744
3745 return c->cpu_affinity_from_numa;
3746 }
3747
3748 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3749 int r;
3750
3751 assert(fds);
3752 assert(n_fds);
3753 assert(*n_fds < fds_size);
3754 assert(ret_fd);
3755
3756 if (fd < 0) {
3757 *ret_fd = -EBADF;
3758 return 0;
3759 }
3760
3761 if (fd < 3 + (int) *n_fds) {
3762 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3763 * the fds we pass to the process (or which are closed only during execve). */
3764
3765 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3766 if (r < 0)
3767 return -errno;
3768
3769 close_and_replace(fd, r);
3770 }
3771
3772 *ret_fd = fds[*n_fds] = fd;
3773 (*n_fds) ++;
3774 return 1;
3775 }
3776
3777 static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
3778 union sockaddr_union addr = {
3779 .un.sun_family = AF_UNIX,
3780 };
3781 socklen_t sa_len;
3782 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3783 int r;
3784
3785 assert(u);
3786 assert(of);
3787 assert(ofd >= 0);
3788
3789 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3790 if (r < 0)
3791 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
3792
3793 sa_len = r;
3794
3795 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
3796 _cleanup_close_ int fd = -EBADF;
3797
3798 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
3799 if (fd < 0)
3800 return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
3801
3802 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3803 if (r == -EPROTOTYPE)
3804 continue;
3805 if (r < 0)
3806 return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
3807
3808 return TAKE_FD(fd);
3809 }
3810
3811 return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
3812 }
3813
3814 static int get_open_file_fd(Unit *u, const OpenFile *of) {
3815 struct stat st;
3816 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3817
3818 assert(u);
3819 assert(of);
3820
3821 ofd = open(of->path, O_PATH | O_CLOEXEC);
3822 if (ofd < 0)
3823 return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
3824
3825 if (fstat(ofd, &st) < 0)
3826 return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
3827
3828 if (S_ISSOCK(st.st_mode)) {
3829 fd = connect_unix_harder(u, of, ofd);
3830 if (fd < 0)
3831 return fd;
3832
3833 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
3834 return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
3835 of->path);
3836
3837 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
3838 } else {
3839 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3840 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3841 flags |= O_APPEND;
3842 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3843 flags |= O_TRUNC;
3844
3845 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3846 if (fd < 0)
3847 return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
3848
3849 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
3850 }
3851
3852 return TAKE_FD(fd);
3853 }
3854
3855 static int collect_open_file_fds(
3856 Unit *u,
3857 OpenFile* open_files,
3858 int **fds,
3859 char ***fdnames,
3860 size_t *n_fds) {
3861 int r;
3862
3863 assert(u);
3864 assert(fds);
3865 assert(fdnames);
3866 assert(n_fds);
3867
3868 LIST_FOREACH(open_files, of, open_files) {
3869 _cleanup_close_ int fd = -EBADF;
3870
3871 fd = get_open_file_fd(u, of);
3872 if (fd < 0) {
3873 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3874 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
3875 continue;
3876 }
3877
3878 return fd;
3879 }
3880
3881 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
3882 return -ENOMEM;
3883
3884 r = strv_extend(fdnames, of->fdname);
3885 if (r < 0)
3886 return r;
3887
3888 (*fds)[*n_fds] = TAKE_FD(fd);
3889
3890 (*n_fds)++;
3891 }
3892
3893 return 0;
3894 }
3895
3896 static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
3897 assert(unit);
3898 assert(msg);
3899 assert(executable);
3900
3901 if (!DEBUG_LOGGING)
3902 return;
3903
3904 _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3905
3906 log_unit_struct(unit, LOG_DEBUG,
3907 "EXECUTABLE=%s", executable,
3908 LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)),
3909 LOG_UNIT_INVOCATION_ID(unit));
3910 }
3911
3912 static bool exec_context_need_unprivileged_private_users(
3913 const ExecContext *context,
3914 const ExecParameters *params) {
3915
3916 assert(context);
3917 assert(params);
3918
3919 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3920 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3921 * (system manager) then we have privileges and don't need this. */
3922 if (params->runtime_scope != RUNTIME_SCOPE_USER)
3923 return false;
3924
3925 return context->private_users ||
3926 context->private_tmp ||
3927 context->private_devices ||
3928 context->private_network ||
3929 context->network_namespace_path ||
3930 context->private_ipc ||
3931 context->ipc_namespace_path ||
3932 context->private_mounts > 0 ||
3933 context->mount_apivfs ||
3934 context->n_bind_mounts > 0 ||
3935 context->n_temporary_filesystems > 0 ||
3936 context->root_directory ||
3937 !strv_isempty(context->extension_directories) ||
3938 context->protect_system != PROTECT_SYSTEM_NO ||
3939 context->protect_home != PROTECT_HOME_NO ||
3940 context->protect_kernel_tunables ||
3941 context->protect_kernel_modules ||
3942 context->protect_kernel_logs ||
3943 context->protect_control_groups ||
3944 context->protect_clock ||
3945 context->protect_hostname ||
3946 !strv_isempty(context->read_write_paths) ||
3947 !strv_isempty(context->read_only_paths) ||
3948 !strv_isempty(context->inaccessible_paths) ||
3949 !strv_isempty(context->exec_paths) ||
3950 !strv_isempty(context->no_exec_paths);
3951 }
3952
3953 static int exec_child(
3954 Unit *unit,
3955 const ExecCommand *command,
3956 const ExecContext *context,
3957 const ExecParameters *params,
3958 ExecRuntime *runtime,
3959 const CGroupContext *cgroup_context,
3960 int socket_fd,
3961 const int named_iofds[static 3],
3962 int *params_fds,
3963 size_t n_socket_fds,
3964 size_t n_storage_fds,
3965 char **files_env,
3966 int user_lookup_fd,
3967 int *exit_status) {
3968
3969 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
3970 int r, ngids = 0, exec_fd;
3971 _cleanup_free_ gid_t *supplementary_gids = NULL;
3972 const char *username = NULL, *groupname = NULL;
3973 _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
3974 const char *home = NULL, *shell = NULL;
3975 char **final_argv = NULL;
3976 dev_t journal_stream_dev = 0;
3977 ino_t journal_stream_ino = 0;
3978 bool userns_set_up = false;
3979 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3980 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3981 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3982 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
3983 #if HAVE_SELINUX
3984 _cleanup_free_ char *mac_selinux_context_net = NULL;
3985 bool use_selinux = false;
3986 #endif
3987 #if ENABLE_SMACK
3988 bool use_smack = false;
3989 #endif
3990 #if HAVE_APPARMOR
3991 bool use_apparmor = false;
3992 #endif
3993 uid_t saved_uid = getuid();
3994 gid_t saved_gid = getgid();
3995 uid_t uid = UID_INVALID;
3996 gid_t gid = GID_INVALID;
3997 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3998 n_keep_fds; /* total number of fds not to close */
3999 int secure_bits;
4000 _cleanup_free_ gid_t *gids_after_pam = NULL;
4001 int ngids_after_pam = 0;
4002 _cleanup_free_ int *fds = NULL;
4003 _cleanup_strv_free_ char **fdnames = NULL;
4004
4005 assert(unit);
4006 assert(command);
4007 assert(context);
4008 assert(params);
4009 assert(exit_status);
4010
4011 /* Explicitly test for CVE-2021-4034 inspired invocations */
4012 assert(command->path);
4013 assert(!strv_isempty(command->argv));
4014
4015 rename_process_from_path(command->path);
4016
4017 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4018 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4019 * both of which will be demoted to SIG_DFL. */
4020 (void) default_signals(SIGNALS_CRASH_HANDLER,
4021 SIGNALS_IGNORE);
4022
4023 if (context->ignore_sigpipe)
4024 (void) ignore_signals(SIGPIPE);
4025
4026 r = reset_signal_mask();
4027 if (r < 0) {
4028 *exit_status = EXIT_SIGNAL_MASK;
4029 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4030 }
4031
4032 if (params->idle_pipe)
4033 do_idle_pipe_dance(params->idle_pipe);
4034
4035 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4036 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4037 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4038 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4039
4040 log_forget_fds();
4041 log_set_open_when_needed(true);
4042 log_settle_target();
4043
4044 /* In case anything used libc syslog(), close this here, too */
4045 closelog();
4046
4047 fds = newdup(int, params_fds, n_fds);
4048 if (!fds) {
4049 *exit_status = EXIT_MEMORY;
4050 return log_oom();
4051 }
4052
4053 fdnames = strv_copy((char**) params->fd_names);
4054 if (!fdnames) {
4055 *exit_status = EXIT_MEMORY;
4056 return log_oom();
4057 }
4058
4059 r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4060 if (r < 0) {
4061 *exit_status = EXIT_FDS;
4062 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4063 }
4064
4065 int keep_fds[n_fds + 3];
4066 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4067 n_keep_fds = n_fds;
4068
4069 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4070 if (r < 0) {
4071 *exit_status = EXIT_FDS;
4072 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4073 }
4074
4075 #if HAVE_LIBBPF
4076 if (unit->manager->restrict_fs) {
4077 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4078 if (bpf_map_fd < 0) {
4079 *exit_status = EXIT_FDS;
4080 return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4081 }
4082
4083 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4084 if (r < 0) {
4085 *exit_status = EXIT_FDS;
4086 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4087 }
4088 }
4089 #endif
4090
4091 r = close_remaining_fds(params, runtime, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4092 if (r < 0) {
4093 *exit_status = EXIT_FDS;
4094 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4095 }
4096
4097 if (!context->same_pgrp &&
4098 setsid() < 0) {
4099 *exit_status = EXIT_SETSID;
4100 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4101 }
4102
4103 exec_context_tty_reset(context, params);
4104
4105 if (unit_shall_confirm_spawn(unit)) {
4106 _cleanup_free_ char *cmdline = NULL;
4107
4108 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4109 if (!cmdline) {
4110 *exit_status = EXIT_MEMORY;
4111 return log_oom();
4112 }
4113
4114 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4115 if (r != CONFIRM_EXECUTE) {
4116 if (r == CONFIRM_PRETEND_SUCCESS) {
4117 *exit_status = EXIT_SUCCESS;
4118 return 0;
4119 }
4120
4121 *exit_status = EXIT_CONFIRM;
4122 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4123 "Execution cancelled by the user");
4124 }
4125 }
4126
4127 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4128 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4129 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4130 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4131 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4132 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4133 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4134 *exit_status = EXIT_MEMORY;
4135 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4136 }
4137
4138 if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4139 _cleanup_strv_free_ char **suggested_paths = NULL;
4140
4141 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4142 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4143 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4144 *exit_status = EXIT_USER;
4145 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4146 }
4147
4148 r = compile_suggested_paths(context, params, &suggested_paths);
4149 if (r < 0) {
4150 *exit_status = EXIT_MEMORY;
4151 return log_oom();
4152 }
4153
4154 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4155 if (r < 0) {
4156 *exit_status = EXIT_USER;
4157 if (r == -EILSEQ)
4158 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4159 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4160 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4161 }
4162
4163 if (!uid_is_valid(uid)) {
4164 *exit_status = EXIT_USER;
4165 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4166 }
4167
4168 if (!gid_is_valid(gid)) {
4169 *exit_status = EXIT_USER;
4170 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4171 }
4172
4173 if (runtime->dynamic_creds->user)
4174 username = runtime->dynamic_creds->user->name;
4175
4176 } else {
4177 if (context->user) {
4178 r = get_fixed_user(context->user, &username, &uid, &gid, &home, &shell);
4179 if (r < 0) {
4180 *exit_status = EXIT_USER;
4181 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
4182 }
4183 }
4184
4185 if (context->group) {
4186 r = get_fixed_group(context->group, &groupname, &gid);
4187 if (r < 0) {
4188 *exit_status = EXIT_GROUP;
4189 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4190 }
4191 }
4192 }
4193
4194 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4195 r = get_supplementary_groups(context, username, groupname, gid,
4196 &supplementary_gids, &ngids);
4197 if (r < 0) {
4198 *exit_status = EXIT_GROUP;
4199 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
4200 }
4201
4202 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4203 if (r < 0) {
4204 *exit_status = EXIT_USER;
4205 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
4206 }
4207
4208 user_lookup_fd = safe_close(user_lookup_fd);
4209
4210 r = acquire_home(context, uid, &home, &home_buffer);
4211 if (r < 0) {
4212 *exit_status = EXIT_CHDIR;
4213 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
4214 }
4215
4216 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4217 if (socket_fd >= 0)
4218 (void) fd_nonblock(socket_fd, false);
4219
4220 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4221 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4222 if (params->cgroup_path) {
4223 _cleanup_free_ char *p = NULL;
4224
4225 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4226 if (r < 0) {
4227 *exit_status = EXIT_CGROUP;
4228 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4229 }
4230
4231 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4232 if (r == -EUCLEAN) {
4233 *exit_status = EXIT_CGROUP;
4234 return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4235 "because the cgroup or one of its parents or "
4236 "siblings is in the threaded mode: %m", p);
4237 }
4238 if (r < 0) {
4239 *exit_status = EXIT_CGROUP;
4240 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4241 }
4242 }
4243
4244 if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4245 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4246 if (r < 0) {
4247 *exit_status = EXIT_NETWORK;
4248 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4249 }
4250 }
4251
4252 if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4253 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4254 if (r < 0) {
4255 *exit_status = EXIT_NAMESPACE;
4256 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4257 }
4258 }
4259
4260 r = setup_input(context, params, socket_fd, named_iofds);
4261 if (r < 0) {
4262 *exit_status = EXIT_STDIN;
4263 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
4264 }
4265
4266 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4267 if (r < 0) {
4268 *exit_status = EXIT_STDOUT;
4269 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
4270 }
4271
4272 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4273 if (r < 0) {
4274 *exit_status = EXIT_STDERR;
4275 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
4276 }
4277
4278 if (context->oom_score_adjust_set) {
4279 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4280 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4281 r = set_oom_score_adjust(context->oom_score_adjust);
4282 if (ERRNO_IS_NEG_PRIVILEGE(r))
4283 log_unit_debug_errno(unit, r,
4284 "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4285 else if (r < 0) {
4286 *exit_status = EXIT_OOM_ADJUST;
4287 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
4288 }
4289 }
4290
4291 if (context->coredump_filter_set) {
4292 r = set_coredump_filter(context->coredump_filter);
4293 if (ERRNO_IS_NEG_PRIVILEGE(r))
4294 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4295 else if (r < 0) {
4296 *exit_status = EXIT_LIMITS;
4297 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4298 }
4299 }
4300
4301 if (context->nice_set) {
4302 r = setpriority_closest(context->nice);
4303 if (r < 0) {
4304 *exit_status = EXIT_NICE;
4305 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4306 }
4307 }
4308
4309 if (context->cpu_sched_set) {
4310 struct sched_param param = {
4311 .sched_priority = context->cpu_sched_priority,
4312 };
4313
4314 r = sched_setscheduler(0,
4315 context->cpu_sched_policy |
4316 (context->cpu_sched_reset_on_fork ?
4317 SCHED_RESET_ON_FORK : 0),
4318 &param);
4319 if (r < 0) {
4320 *exit_status = EXIT_SETSCHEDULER;
4321 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4322 }
4323 }
4324
4325 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4326 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4327 const CPUSet *cpu_set;
4328
4329 if (context->cpu_affinity_from_numa) {
4330 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4331 if (r < 0) {
4332 *exit_status = EXIT_CPUAFFINITY;
4333 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4334 }
4335
4336 cpu_set = &converted_cpu_set;
4337 } else
4338 cpu_set = &context->cpu_set;
4339
4340 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4341 *exit_status = EXIT_CPUAFFINITY;
4342 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4343 }
4344 }
4345
4346 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4347 r = apply_numa_policy(&context->numa_policy);
4348 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4349 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4350 else if (r < 0) {
4351 *exit_status = EXIT_NUMA_POLICY;
4352 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4353 }
4354 }
4355
4356 if (context->ioprio_set)
4357 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4358 *exit_status = EXIT_IOPRIO;
4359 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4360 }
4361
4362 if (context->timer_slack_nsec != NSEC_INFINITY)
4363 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4364 *exit_status = EXIT_TIMERSLACK;
4365 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4366 }
4367
4368 if (context->personality != PERSONALITY_INVALID) {
4369 r = safe_personality(context->personality);
4370 if (r < 0) {
4371 *exit_status = EXIT_PERSONALITY;
4372 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4373 }
4374 }
4375
4376 if (context->utmp_id) {
4377 const char *line = context->tty_path ?
4378 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4379 NULL;
4380 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4381 line,
4382 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4383 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4384 USER_PROCESS,
4385 username);
4386 }
4387
4388 if (uid_is_valid(uid)) {
4389 r = chown_terminal(STDIN_FILENO, uid);
4390 if (r < 0) {
4391 *exit_status = EXIT_STDIN;
4392 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4393 }
4394 }
4395
4396 if (params->cgroup_path) {
4397 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4398 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4399 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4400 * touch a single hierarchy too. */
4401
4402 if (params->flags & EXEC_CGROUP_DELEGATE) {
4403 _cleanup_free_ char *p = NULL;
4404
4405 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4406 if (r < 0) {
4407 *exit_status = EXIT_CGROUP;
4408 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4409 }
4410
4411 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4412 if (r < 0) {
4413 *exit_status = EXIT_CGROUP;
4414 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4415 }
4416 if (r > 0) {
4417 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
4418 if (r < 0) {
4419 *exit_status = EXIT_CGROUP;
4420 return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
4421 }
4422 }
4423 }
4424
4425 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4426 if (cgroup_context_want_memory_pressure(cgroup_context)) {
4427 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4428 if (r < 0) {
4429 *exit_status = EXIT_MEMORY;
4430 return log_oom();
4431 }
4432
4433 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4434 if (r < 0) {
4435 log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4436 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4437 memory_pressure_path = mfree(memory_pressure_path);
4438 }
4439 } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4440 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4441 if (!memory_pressure_path) {
4442 *exit_status = EXIT_MEMORY;
4443 return log_oom();
4444 }
4445 }
4446 }
4447 }
4448
4449 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4450
4451 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4452 r = setup_exec_directory(unit, context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4453 if (r < 0)
4454 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4455 }
4456
4457 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4458 r = exec_setup_credentials(context, params, unit->id, uid, gid);
4459 if (r < 0) {
4460 *exit_status = EXIT_CREDENTIALS;
4461 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4462 }
4463 }
4464
4465 r = build_environment(
4466 unit,
4467 context,
4468 params,
4469 cgroup_context,
4470 n_fds,
4471 fdnames,
4472 home,
4473 username,
4474 shell,
4475 journal_stream_dev,
4476 journal_stream_ino,
4477 memory_pressure_path,
4478 &our_env);
4479 if (r < 0) {
4480 *exit_status = EXIT_MEMORY;
4481 return log_oom();
4482 }
4483
4484 r = build_pass_environment(context, &pass_env);
4485 if (r < 0) {
4486 *exit_status = EXIT_MEMORY;
4487 return log_oom();
4488 }
4489
4490 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4491 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4492 * not specify PATH but the unit has ExecSearchPath. */
4493 if (!strv_isempty(context->exec_search_path)) {
4494 _cleanup_free_ char *joined = NULL;
4495
4496 joined = strv_join(context->exec_search_path, ":");
4497 if (!joined) {
4498 *exit_status = EXIT_MEMORY;
4499 return log_oom();
4500 }
4501
4502 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4503 if (r < 0) {
4504 *exit_status = EXIT_MEMORY;
4505 return log_oom();
4506 }
4507 }
4508
4509 accum_env = strv_env_merge(params->environment,
4510 our_env,
4511 joined_exec_search_path,
4512 pass_env,
4513 context->environment,
4514 files_env);
4515 if (!accum_env) {
4516 *exit_status = EXIT_MEMORY;
4517 return log_oom();
4518 }
4519 accum_env = strv_env_clean(accum_env);
4520
4521 (void) umask(context->umask);
4522
4523 r = setup_keyring(unit, context, params, uid, gid);
4524 if (r < 0) {
4525 *exit_status = EXIT_KEYRING;
4526 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4527 }
4528
4529 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4530 * from it. */
4531 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4532
4533 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4534 * for it, and the kernel doesn't actually support ambient caps. */
4535 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4536
4537 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4538 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4539 * desired. */
4540 if (needs_ambient_hack)
4541 needs_setuid = false;
4542 else
4543 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4544
4545 uint64_t capability_ambient_set = context->capability_ambient_set;
4546
4547 if (needs_sandboxing) {
4548 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4549 * /sys being present. The actual MAC context application will happen later, as late as
4550 * possible, to avoid impacting our own code paths. */
4551
4552 #if HAVE_SELINUX
4553 use_selinux = mac_selinux_use();
4554 #endif
4555 #if ENABLE_SMACK
4556 use_smack = mac_smack_use();
4557 #endif
4558 #if HAVE_APPARMOR
4559 use_apparmor = mac_apparmor_use();
4560 #endif
4561 }
4562
4563 if (needs_sandboxing) {
4564 int which_failed;
4565
4566 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4567 * is set here. (See below.) */
4568
4569 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4570 if (r < 0) {
4571 *exit_status = EXIT_LIMITS;
4572 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4573 }
4574 }
4575
4576 if (needs_setuid && context->pam_name && username) {
4577 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4578 * wins here. (See above.) */
4579
4580 /* All fds passed in the fds array will be closed in the pam child process. */
4581 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4582 if (r < 0) {
4583 *exit_status = EXIT_PAM;
4584 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4585 }
4586
4587 if (ambient_capabilities_supported()) {
4588 uint64_t ambient_after_pam;
4589
4590 /* PAM modules might have set some ambient caps. Query them here and merge them into
4591 * the caps we want to set in the end, so that we don't end up unsetting them. */
4592 r = capability_get_ambient(&ambient_after_pam);
4593 if (r < 0) {
4594 *exit_status = EXIT_CAPABILITIES;
4595 return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
4596 }
4597
4598 capability_ambient_set |= ambient_after_pam;
4599 }
4600
4601 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4602 if (ngids_after_pam < 0) {
4603 *exit_status = EXIT_MEMORY;
4604 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4605 }
4606 }
4607
4608 if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
4609 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4610 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4611 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4612
4613 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4614 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4615 * the actual requested operations fail (or silently continue). */
4616 if (r < 0 && context->private_users) {
4617 *exit_status = EXIT_USER;
4618 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4619 }
4620 if (r < 0)
4621 log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4622 else
4623 userns_set_up = true;
4624 }
4625
4626 if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4627
4628 /* Try to enable network namespacing if network namespacing is available and we have
4629 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4630 * new network namespace. And if we don't have that, then we could only create a network
4631 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4632 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
4633 r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
4634 if (ERRNO_IS_NEG_PRIVILEGE(r))
4635 log_unit_notice_errno(unit, r,
4636 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4637 else if (r < 0) {
4638 *exit_status = EXIT_NETWORK;
4639 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4640 }
4641 } else if (context->network_namespace_path) {
4642 *exit_status = EXIT_NETWORK;
4643 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4644 "NetworkNamespacePath= is not supported, refusing.");
4645 } else
4646 log_unit_notice(unit, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4647 }
4648
4649 if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4650
4651 if (ns_type_supported(NAMESPACE_IPC)) {
4652 r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4653 if (r == -EPERM)
4654 log_unit_warning_errno(unit, r,
4655 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4656 else if (r < 0) {
4657 *exit_status = EXIT_NAMESPACE;
4658 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4659 }
4660 } else if (context->ipc_namespace_path) {
4661 *exit_status = EXIT_NAMESPACE;
4662 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4663 "IPCNamespacePath= is not supported, refusing.");
4664 } else
4665 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4666 }
4667
4668 if (needs_mount_namespace) {
4669 _cleanup_free_ char *error_path = NULL;
4670
4671 r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
4672 if (r < 0) {
4673 *exit_status = EXIT_NAMESPACE;
4674 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4675 error_path ? ": " : "", strempty(error_path));
4676 }
4677 }
4678
4679 if (needs_sandboxing) {
4680 r = apply_protect_hostname(unit, context, exit_status);
4681 if (r < 0)
4682 return r;
4683 }
4684
4685 if (context->memory_ksm >= 0)
4686 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
4687 if (ERRNO_IS_NOT_SUPPORTED(errno))
4688 log_unit_debug_errno(unit, errno, "KSM support not available, ignoring.");
4689 else {
4690 *exit_status = EXIT_KSM;
4691 return log_unit_error_errno(unit, errno, "Failed to set KSM: %m");
4692 }
4693 }
4694
4695 /* Drop groups as early as possible.
4696 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4697 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4698 if (needs_setuid) {
4699 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4700 int ngids_to_enforce = 0;
4701
4702 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4703 ngids,
4704 gids_after_pam,
4705 ngids_after_pam,
4706 &gids_to_enforce);
4707 if (ngids_to_enforce < 0) {
4708 *exit_status = EXIT_MEMORY;
4709 return log_unit_error_errno(unit,
4710 ngids_to_enforce,
4711 "Failed to merge group lists. Group membership might be incorrect: %m");
4712 }
4713
4714 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4715 if (r < 0) {
4716 *exit_status = EXIT_GROUP;
4717 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4718 }
4719 }
4720
4721 /* If the user namespace was not set up above, try to do it now.
4722 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4723 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4724 * case of mount namespaces being less privileged when the mount point list is copied from a
4725 * different user namespace). */
4726
4727 if (needs_sandboxing && context->private_users && !userns_set_up) {
4728 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4729 if (r < 0) {
4730 *exit_status = EXIT_USER;
4731 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4732 }
4733 }
4734
4735 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4736 * shall execute. */
4737
4738 _cleanup_free_ char *executable = NULL;
4739 _cleanup_close_ int executable_fd = -EBADF;
4740 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4741 if (r < 0) {
4742 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4743 log_unit_struct_errno(unit, LOG_INFO, r,
4744 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4745 LOG_UNIT_INVOCATION_ID(unit),
4746 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4747 command->path),
4748 "EXECUTABLE=%s", command->path);
4749 *exit_status = EXIT_SUCCESS;
4750 return 0;
4751 }
4752
4753 *exit_status = EXIT_EXEC;
4754 return log_unit_struct_errno(unit, LOG_INFO, r,
4755 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4756 LOG_UNIT_INVOCATION_ID(unit),
4757 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4758 command->path),
4759 "EXECUTABLE=%s", command->path);
4760 }
4761
4762 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4763 if (r < 0) {
4764 *exit_status = EXIT_FDS;
4765 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4766 }
4767
4768 #if HAVE_SELINUX
4769 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4770 int fd = -EBADF;
4771
4772 if (socket_fd >= 0)
4773 fd = socket_fd;
4774 else if (params->n_socket_fds == 1)
4775 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4776 * use context from that fd to compute the label. */
4777 fd = params->fds[0];
4778
4779 if (fd >= 0) {
4780 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4781 if (r < 0) {
4782 if (!context->selinux_context_ignore) {
4783 *exit_status = EXIT_SELINUX_CONTEXT;
4784 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4785 }
4786 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
4787 }
4788 }
4789 }
4790 #endif
4791
4792 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4793 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4794 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
4795 * execve(). */
4796
4797 r = close_all_fds(keep_fds, n_keep_fds);
4798 if (r >= 0)
4799 r = shift_fds(fds, n_fds);
4800 if (r >= 0)
4801 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
4802 if (r < 0) {
4803 *exit_status = EXIT_FDS;
4804 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4805 }
4806
4807 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4808 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4809 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4810 * came this far. */
4811
4812 secure_bits = context->secure_bits;
4813
4814 if (needs_sandboxing) {
4815 uint64_t bset;
4816
4817 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4818 * (Note this is placed after the general resource limit initialization, see above, in order
4819 * to take precedence.) */
4820 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4821 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4822 *exit_status = EXIT_LIMITS;
4823 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4824 }
4825 }
4826
4827 #if ENABLE_SMACK
4828 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4829 * process. This is the latest place before dropping capabilities. Other MAC contexts are set later. */
4830 if (use_smack) {
4831 r = setup_smack(unit->manager, context, executable_fd);
4832 if (r < 0 && !context->smack_process_label_ignore) {
4833 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4834 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4835 }
4836 }
4837 #endif
4838
4839 bset = context->capability_bounding_set;
4840 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4841 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4842 * instead of us doing that */
4843 if (needs_ambient_hack)
4844 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4845 (UINT64_C(1) << CAP_SETUID) |
4846 (UINT64_C(1) << CAP_SETGID);
4847
4848 if (!cap_test_all(bset)) {
4849 r = capability_bounding_set_drop(bset, /* right_now= */ false);
4850 if (r < 0) {
4851 *exit_status = EXIT_CAPABILITIES;
4852 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4853 }
4854 }
4855
4856 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4857 * keep-caps set.
4858 *
4859 * To be able to raise the ambient capabilities after setresuid() they have to be added to
4860 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
4861 * the ambient capabilities can be raised as they are present in the permitted and
4862 * inheritable set. However it is possible that someone wants to set ambient capabilities
4863 * without changing the user, so we also set the ambient capabilities here.
4864 *
4865 * The requested ambient capabilities are raised in the inheritable set if the second
4866 * argument is true. */
4867 if (!needs_ambient_hack) {
4868 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
4869 if (r < 0) {
4870 *exit_status = EXIT_CAPABILITIES;
4871 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4872 }
4873 }
4874 }
4875
4876 /* chroot to root directory first, before we lose the ability to chroot */
4877 r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
4878 if (r < 0)
4879 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4880
4881 if (needs_setuid) {
4882 if (uid_is_valid(uid)) {
4883 r = enforce_user(context, uid, capability_ambient_set);
4884 if (r < 0) {
4885 *exit_status = EXIT_USER;
4886 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4887 }
4888
4889 if (!needs_ambient_hack && capability_ambient_set != 0) {
4890
4891 /* Raise the ambient capabilities after user change. */
4892 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
4893 if (r < 0) {
4894 *exit_status = EXIT_CAPABILITIES;
4895 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4896 }
4897 }
4898 }
4899 }
4900
4901 /* Apply working directory here, because the working directory might be on NFS and only the user running
4902 * this service might have the correct privilege to change to the working directory */
4903 r = apply_working_directory(context, params, runtime, home, exit_status);
4904 if (r < 0)
4905 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4906
4907 if (needs_sandboxing) {
4908 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4909 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4910 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4911 * are restricted. */
4912
4913 #if HAVE_SELINUX
4914 if (use_selinux) {
4915 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4916
4917 if (exec_context) {
4918 r = setexeccon(exec_context);
4919 if (r < 0) {
4920 if (!context->selinux_context_ignore) {
4921 *exit_status = EXIT_SELINUX_CONTEXT;
4922 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4923 }
4924 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
4925 }
4926 }
4927 }
4928 #endif
4929
4930 #if HAVE_APPARMOR
4931 if (use_apparmor && context->apparmor_profile) {
4932 r = aa_change_onexec(context->apparmor_profile);
4933 if (r < 0 && !context->apparmor_profile_ignore) {
4934 *exit_status = EXIT_APPARMOR_PROFILE;
4935 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4936 }
4937 }
4938 #endif
4939
4940 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
4941 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
4942 * requires CAP_SETPCAP. */
4943 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4944 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4945 * effective set here.
4946 *
4947 * The effective set is overwritten during execve() with the following values:
4948 *
4949 * - ambient set (for non-root processes)
4950 *
4951 * - (inheritable | bounding) set (for root processes)
4952 *
4953 * Hence there is no security impact to raise it in the effective set before execve
4954 */
4955 r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
4956 if (r < 0) {
4957 *exit_status = EXIT_CAPABILITIES;
4958 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4959 }
4960 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4961 *exit_status = EXIT_SECUREBITS;
4962 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
4963 }
4964 }
4965
4966 if (context_has_no_new_privileges(context))
4967 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4968 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4969 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
4970 }
4971
4972 #if HAVE_SECCOMP
4973 r = apply_address_families(unit, context);
4974 if (r < 0) {
4975 *exit_status = EXIT_ADDRESS_FAMILIES;
4976 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4977 }
4978
4979 r = apply_memory_deny_write_execute(unit, context);
4980 if (r < 0) {
4981 *exit_status = EXIT_SECCOMP;
4982 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
4983 }
4984
4985 r = apply_restrict_realtime(unit, context);
4986 if (r < 0) {
4987 *exit_status = EXIT_SECCOMP;
4988 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
4989 }
4990
4991 r = apply_restrict_suid_sgid(unit, context);
4992 if (r < 0) {
4993 *exit_status = EXIT_SECCOMP;
4994 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4995 }
4996
4997 r = apply_restrict_namespaces(unit, context);
4998 if (r < 0) {
4999 *exit_status = EXIT_SECCOMP;
5000 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5001 }
5002
5003 r = apply_protect_sysctl(unit, context);
5004 if (r < 0) {
5005 *exit_status = EXIT_SECCOMP;
5006 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5007 }
5008
5009 r = apply_protect_kernel_modules(unit, context);
5010 if (r < 0) {
5011 *exit_status = EXIT_SECCOMP;
5012 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5013 }
5014
5015 r = apply_protect_kernel_logs(unit, context);
5016 if (r < 0) {
5017 *exit_status = EXIT_SECCOMP;
5018 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5019 }
5020
5021 r = apply_protect_clock(unit, context);
5022 if (r < 0) {
5023 *exit_status = EXIT_SECCOMP;
5024 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5025 }
5026
5027 r = apply_private_devices(unit, context);
5028 if (r < 0) {
5029 *exit_status = EXIT_SECCOMP;
5030 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5031 }
5032
5033 r = apply_syscall_archs(unit, context);
5034 if (r < 0) {
5035 *exit_status = EXIT_SECCOMP;
5036 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5037 }
5038
5039 r = apply_lock_personality(unit, context);
5040 if (r < 0) {
5041 *exit_status = EXIT_SECCOMP;
5042 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5043 }
5044
5045 r = apply_syscall_log(unit, context);
5046 if (r < 0) {
5047 *exit_status = EXIT_SECCOMP;
5048 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5049 }
5050
5051 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5052 * by the filter as little as possible. */
5053 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5054 if (r < 0) {
5055 *exit_status = EXIT_SECCOMP;
5056 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5057 }
5058 #endif
5059
5060 #if HAVE_LIBBPF
5061 r = apply_restrict_filesystems(unit, context);
5062 if (r < 0) {
5063 *exit_status = EXIT_BPF;
5064 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5065 }
5066 #endif
5067
5068 }
5069
5070 if (!strv_isempty(context->unset_environment)) {
5071 char **ee = NULL;
5072
5073 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5074 if (!ee) {
5075 *exit_status = EXIT_MEMORY;
5076 return log_oom();
5077 }
5078
5079 strv_free_and_replace(accum_env, ee);
5080 }
5081
5082 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5083 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5084
5085 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5086 if (r < 0) {
5087 *exit_status = EXIT_MEMORY;
5088 return log_unit_error_errno(unit, r, "Failed to replace environment variables: %m");
5089 }
5090 final_argv = replaced_argv;
5091
5092 if (!strv_isempty(unset_variables)) {
5093 _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5094 log_unit_warning(unit, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5095 }
5096
5097 if (!strv_isempty(bad_variables)) {
5098 _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5099 log_unit_warning(unit, "Invalid environment variable name evaluates to an empty string: %s", strna(jb));;
5100 }
5101 } else
5102 final_argv = command->argv;
5103
5104 log_command_line(unit, "Executing", executable, final_argv);
5105
5106 if (exec_fd >= 0) {
5107 uint8_t hot = 1;
5108
5109 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5110 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5111
5112 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5113 *exit_status = EXIT_EXEC;
5114 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5115 }
5116 }
5117
5118 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5119
5120 if (exec_fd >= 0) {
5121 uint8_t hot = 0;
5122
5123 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5124 * that POLLHUP on it no longer means execve() succeeded. */
5125
5126 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5127 *exit_status = EXIT_EXEC;
5128 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5129 }
5130 }
5131
5132 *exit_status = EXIT_EXEC;
5133 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5134 }
5135
5136 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
5137 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5138
5139 int exec_spawn(Unit *unit,
5140 ExecCommand *command,
5141 const ExecContext *context,
5142 const ExecParameters *params,
5143 ExecRuntime *runtime,
5144 const CGroupContext *cgroup_context,
5145 pid_t *ret) {
5146
5147 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
5148 _cleanup_free_ char *subcgroup_path = NULL;
5149 _cleanup_strv_free_ char **files_env = NULL;
5150 size_t n_storage_fds = 0, n_socket_fds = 0;
5151 pid_t pid;
5152
5153 assert(unit);
5154 assert(command);
5155 assert(context);
5156 assert(ret);
5157 assert(params);
5158 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
5159
5160 LOG_CONTEXT_PUSH_UNIT(unit);
5161
5162 if (context->std_input == EXEC_INPUT_SOCKET ||
5163 context->std_output == EXEC_OUTPUT_SOCKET ||
5164 context->std_error == EXEC_OUTPUT_SOCKET) {
5165
5166 if (params->n_socket_fds > 1)
5167 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
5168
5169 if (params->n_socket_fds == 0)
5170 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
5171
5172 socket_fd = params->fds[0];
5173 } else {
5174 socket_fd = -EBADF;
5175 fds = params->fds;
5176 n_socket_fds = params->n_socket_fds;
5177 n_storage_fds = params->n_storage_fds;
5178 }
5179
5180 r = exec_context_named_iofds(context, params, named_iofds);
5181 if (r < 0)
5182 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5183
5184 r = exec_context_load_environment(unit, context, &files_env);
5185 if (r < 0)
5186 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
5187
5188 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5189 and, until the next SELinux policy changes, we save further reloads in future children. */
5190 mac_selinux_maybe_reload();
5191
5192 /* We won't know the real executable path until we create the mount namespace in the child, but we
5193 want to log from the parent, so we use the possibly inaccurate path here. */
5194 log_command_line(unit, "About to execute", command->path, command->argv);
5195
5196 if (params->cgroup_path) {
5197 r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
5198 if (r < 0)
5199 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5200 if (r > 0) {
5201 /* If there's a subcgroup, then let's create it here now (the main cgroup was already
5202 * realized by the unit logic) */
5203
5204 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5205 if (r < 0)
5206 return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
5207 }
5208 }
5209
5210 pid = fork();
5211 if (pid < 0)
5212 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
5213
5214 if (pid == 0) {
5215 int exit_status;
5216
5217 r = exec_child(unit,
5218 command,
5219 context,
5220 params,
5221 runtime,
5222 cgroup_context,
5223 socket_fd,
5224 named_iofds,
5225 fds,
5226 n_socket_fds,
5227 n_storage_fds,
5228 files_env,
5229 unit->manager->user_lookup_fds[1],
5230 &exit_status);
5231
5232 if (r < 0) {
5233 const char *status = ASSERT_PTR(
5234 exit_status_to_string(exit_status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD));
5235
5236 log_unit_struct_errno(unit, LOG_ERR, r,
5237 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5238 LOG_UNIT_INVOCATION_ID(unit),
5239 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5240 status, command->path),
5241 "EXECUTABLE=%s", command->path);
5242 } else
5243 assert(exit_status == EXIT_SUCCESS);
5244
5245 _exit(exit_status);
5246 }
5247
5248 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
5249
5250 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5251 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5252 * process will be killed too). */
5253 if (subcgroup_path)
5254 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
5255
5256 exec_status_start(&command->exec_status, pid);
5257
5258 *ret = pid;
5259 return 0;
5260 }
5261
5262 void exec_context_init(ExecContext *c) {
5263 assert(c);
5264
5265 *c = (ExecContext) {
5266 .umask = 0022,
5267 .ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO,
5268 .cpu_sched_policy = SCHED_OTHER,
5269 .syslog_priority = LOG_DAEMON|LOG_INFO,
5270 .syslog_level_prefix = true,
5271 .ignore_sigpipe = true,
5272 .timer_slack_nsec = NSEC_INFINITY,
5273 .personality = PERSONALITY_INVALID,
5274 .timeout_clean_usec = USEC_INFINITY,
5275 .capability_bounding_set = CAP_MASK_UNSET,
5276 .restrict_namespaces = NAMESPACE_FLAGS_INITIAL,
5277 .log_level_max = -1,
5278 #if HAVE_SECCOMP
5279 .syscall_errno = SECCOMP_ERROR_NUMBER_KILL,
5280 #endif
5281 .tty_rows = UINT_MAX,
5282 .tty_cols = UINT_MAX,
5283 .private_mounts = -1,
5284 .memory_ksm = -1,
5285 .set_login_environment = -1,
5286 };
5287
5288 FOREACH_ARRAY(d, c->directories, _EXEC_DIRECTORY_TYPE_MAX)
5289 d->mode = 0755;
5290
5291 numa_policy_reset(&c->numa_policy);
5292
5293 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5294 }
5295
5296 void exec_context_done(ExecContext *c) {
5297 assert(c);
5298
5299 c->environment = strv_free(c->environment);
5300 c->environment_files = strv_free(c->environment_files);
5301 c->pass_environment = strv_free(c->pass_environment);
5302 c->unset_environment = strv_free(c->unset_environment);
5303
5304 rlimit_free_all(c->rlimit);
5305
5306 for (size_t l = 0; l < 3; l++) {
5307 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
5308 c->stdio_file[l] = mfree(c->stdio_file[l]);
5309 }
5310
5311 c->working_directory = mfree(c->working_directory);
5312 c->root_directory = mfree(c->root_directory);
5313 c->root_image = mfree(c->root_image);
5314 c->root_image_options = mount_options_free_all(c->root_image_options);
5315 c->root_hash = mfree(c->root_hash);
5316 c->root_hash_size = 0;
5317 c->root_hash_path = mfree(c->root_hash_path);
5318 c->root_hash_sig = mfree(c->root_hash_sig);
5319 c->root_hash_sig_size = 0;
5320 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
5321 c->root_verity = mfree(c->root_verity);
5322 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
5323 c->extension_directories = strv_free(c->extension_directories);
5324 c->tty_path = mfree(c->tty_path);
5325 c->syslog_identifier = mfree(c->syslog_identifier);
5326 c->user = mfree(c->user);
5327 c->group = mfree(c->group);
5328
5329 c->supplementary_groups = strv_free(c->supplementary_groups);
5330
5331 c->pam_name = mfree(c->pam_name);
5332
5333 c->read_only_paths = strv_free(c->read_only_paths);
5334 c->read_write_paths = strv_free(c->read_write_paths);
5335 c->inaccessible_paths = strv_free(c->inaccessible_paths);
5336 c->exec_paths = strv_free(c->exec_paths);
5337 c->no_exec_paths = strv_free(c->no_exec_paths);
5338 c->exec_search_path = strv_free(c->exec_search_path);
5339
5340 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
5341 c->bind_mounts = NULL;
5342 c->n_bind_mounts = 0;
5343 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5344 c->temporary_filesystems = NULL;
5345 c->n_temporary_filesystems = 0;
5346 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
5347
5348 cpu_set_reset(&c->cpu_set);
5349 numa_policy_reset(&c->numa_policy);
5350
5351 c->utmp_id = mfree(c->utmp_id);
5352 c->selinux_context = mfree(c->selinux_context);
5353 c->apparmor_profile = mfree(c->apparmor_profile);
5354 c->smack_process_label = mfree(c->smack_process_label);
5355
5356 c->restrict_filesystems = set_free_free(c->restrict_filesystems);
5357
5358 c->syscall_filter = hashmap_free(c->syscall_filter);
5359 c->syscall_archs = set_free(c->syscall_archs);
5360 c->address_families = set_free(c->address_families);
5361
5362 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5363 exec_directory_done(&c->directories[t]);
5364
5365 c->log_level_max = -1;
5366
5367 exec_context_free_log_extra_fields(c);
5368 c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns);
5369 c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns);
5370
5371 c->log_ratelimit_interval_usec = 0;
5372 c->log_ratelimit_burst = 0;
5373
5374 c->stdin_data = mfree(c->stdin_data);
5375 c->stdin_data_size = 0;
5376
5377 c->network_namespace_path = mfree(c->network_namespace_path);
5378 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
5379
5380 c->log_namespace = mfree(c->log_namespace);
5381
5382 c->load_credentials = hashmap_free(c->load_credentials);
5383 c->set_credentials = hashmap_free(c->set_credentials);
5384 c->import_credentials = set_free_free(c->import_credentials);
5385
5386 c->root_image_policy = image_policy_free(c->root_image_policy);
5387 c->mount_image_policy = image_policy_free(c->mount_image_policy);
5388 c->extension_image_policy = image_policy_free(c->extension_image_policy);
5389 }
5390
5391 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
5392 assert(c);
5393
5394 if (!runtime_prefix)
5395 return 0;
5396
5397 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
5398 _cleanup_free_ char *p = NULL;
5399
5400 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5401 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5402 else
5403 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5404 if (!p)
5405 return -ENOMEM;
5406
5407 /* We execute this synchronously, since we need to be sure this is gone when we start the
5408 * service next. */
5409 (void) rm_rf(p, REMOVE_ROOT);
5410
5411 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5412 _cleanup_free_ char *symlink_abs = NULL;
5413
5414 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5415 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5416 else
5417 symlink_abs = path_join(runtime_prefix, *symlink);
5418 if (!symlink_abs)
5419 return -ENOMEM;
5420
5421 (void) unlink(symlink_abs);
5422 }
5423 }
5424
5425 return 0;
5426 }
5427
5428 int exec_context_destroy_mount_ns_dir(Unit *u) {
5429 _cleanup_free_ char *p = NULL;
5430
5431 if (!u || !MANAGER_IS_SYSTEM(u->manager))
5432 return 0;
5433
5434 p = path_join("/run/systemd/propagate/", u->id);
5435 if (!p)
5436 return -ENOMEM;
5437
5438 /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
5439 if (rmdir(p) < 0 && errno != ENOENT)
5440 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
5441
5442 return 0;
5443 }
5444
5445 static void exec_command_done(ExecCommand *c) {
5446 assert(c);
5447
5448 c->path = mfree(c->path);
5449 c->argv = strv_free(c->argv);
5450 }
5451
5452 void exec_command_done_array(ExecCommand *c, size_t n) {
5453 for (size_t i = 0; i < n; i++)
5454 exec_command_done(c+i);
5455 }
5456
5457 ExecCommand* exec_command_free_list(ExecCommand *c) {
5458 ExecCommand *i;
5459
5460 while ((i = LIST_POP(command, c))) {
5461 exec_command_done(i);
5462 free(i);
5463 }
5464
5465 return NULL;
5466 }
5467
5468 void exec_command_free_array(ExecCommand **c, size_t n) {
5469 for (size_t i = 0; i < n; i++)
5470 c[i] = exec_command_free_list(c[i]);
5471 }
5472
5473 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5474 for (size_t i = 0; i < n; i++)
5475 exec_status_reset(&c[i].exec_status);
5476 }
5477
5478 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5479 for (size_t i = 0; i < n; i++)
5480 LIST_FOREACH(command, z, c[i])
5481 exec_status_reset(&z->exec_status);
5482 }
5483
5484 typedef struct InvalidEnvInfo {
5485 const Unit *unit;
5486 const char *path;
5487 } InvalidEnvInfo;
5488
5489 static void invalid_env(const char *p, void *userdata) {
5490 InvalidEnvInfo *info = userdata;
5491
5492 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5493 }
5494
5495 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5496 assert(c);
5497
5498 switch (fd_index) {
5499
5500 case STDIN_FILENO:
5501 if (c->std_input != EXEC_INPUT_NAMED_FD)
5502 return NULL;
5503
5504 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5505
5506 case STDOUT_FILENO:
5507 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5508 return NULL;
5509
5510 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5511
5512 case STDERR_FILENO:
5513 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5514 return NULL;
5515
5516 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5517
5518 default:
5519 return NULL;
5520 }
5521 }
5522
5523 static int exec_context_named_iofds(
5524 const ExecContext *c,
5525 const ExecParameters *p,
5526 int named_iofds[static 3]) {
5527
5528 size_t targets;
5529 const char* stdio_fdname[3];
5530 size_t n_fds;
5531
5532 assert(c);
5533 assert(p);
5534 assert(named_iofds);
5535
5536 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5537 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5538 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5539
5540 for (size_t i = 0; i < 3; i++)
5541 stdio_fdname[i] = exec_context_fdname(c, i);
5542
5543 n_fds = p->n_storage_fds + p->n_socket_fds;
5544
5545 for (size_t i = 0; i < n_fds && targets > 0; i++)
5546 if (named_iofds[STDIN_FILENO] < 0 &&
5547 c->std_input == EXEC_INPUT_NAMED_FD &&
5548 stdio_fdname[STDIN_FILENO] &&
5549 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5550
5551 named_iofds[STDIN_FILENO] = p->fds[i];
5552 targets--;
5553
5554 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5555 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5556 stdio_fdname[STDOUT_FILENO] &&
5557 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5558
5559 named_iofds[STDOUT_FILENO] = p->fds[i];
5560 targets--;
5561
5562 } else if (named_iofds[STDERR_FILENO] < 0 &&
5563 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5564 stdio_fdname[STDERR_FILENO] &&
5565 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5566
5567 named_iofds[STDERR_FILENO] = p->fds[i];
5568 targets--;
5569 }
5570
5571 return targets == 0 ? 0 : -ENOENT;
5572 }
5573
5574 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5575 _cleanup_strv_free_ char **v = NULL;
5576 int r;
5577
5578 assert(c);
5579 assert(ret);
5580
5581 STRV_FOREACH(i, c->environment_files) {
5582 _cleanup_globfree_ glob_t pglob = {};
5583 bool ignore = false;
5584 char *fn = *i;
5585
5586 if (fn[0] == '-') {
5587 ignore = true;
5588 fn++;
5589 }
5590
5591 if (!path_is_absolute(fn)) {
5592 if (ignore)
5593 continue;
5594 return -EINVAL;
5595 }
5596
5597 /* Filename supports globbing, take all matching files */
5598 r = safe_glob(fn, 0, &pglob);
5599 if (r < 0) {
5600 if (ignore)
5601 continue;
5602 return r;
5603 }
5604
5605 /* When we don't match anything, -ENOENT should be returned */
5606 assert(pglob.gl_pathc > 0);
5607
5608 for (size_t n = 0; n < pglob.gl_pathc; n++) {
5609 _cleanup_strv_free_ char **p = NULL;
5610
5611 r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5612 if (r < 0) {
5613 if (ignore)
5614 continue;
5615 return r;
5616 }
5617
5618 /* Log invalid environment variables with filename */
5619 if (p) {
5620 InvalidEnvInfo info = {
5621 .unit = unit,
5622 .path = pglob.gl_pathv[n]
5623 };
5624
5625 p = strv_env_clean_with_callback(p, invalid_env, &info);
5626 }
5627
5628 if (!v)
5629 v = TAKE_PTR(p);
5630 else {
5631 char **m = strv_env_merge(v, p);
5632 if (!m)
5633 return -ENOMEM;
5634
5635 strv_free_and_replace(v, m);
5636 }
5637 }
5638 }
5639
5640 *ret = TAKE_PTR(v);
5641
5642 return 0;
5643 }
5644
5645 static bool tty_may_match_dev_console(const char *tty) {
5646 _cleanup_free_ char *resolved = NULL;
5647
5648 if (!tty)
5649 return true;
5650
5651 tty = skip_dev_prefix(tty);
5652
5653 /* trivial identity? */
5654 if (streq(tty, "console"))
5655 return true;
5656
5657 if (resolve_dev_console(&resolved) < 0)
5658 return true; /* if we could not resolve, assume it may */
5659
5660 /* "tty0" means the active VC, so it may be the same sometimes */
5661 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5662 }
5663
5664 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5665 assert(ec);
5666
5667 return ec->tty_reset ||
5668 ec->tty_vhangup ||
5669 ec->tty_vt_disallocate ||
5670 is_terminal_input(ec->std_input) ||
5671 is_terminal_output(ec->std_output) ||
5672 is_terminal_output(ec->std_error);
5673 }
5674
5675 bool exec_context_may_touch_console(const ExecContext *ec) {
5676
5677 return exec_context_may_touch_tty(ec) &&
5678 tty_may_match_dev_console(exec_context_tty_path(ec));
5679 }
5680
5681 static void strv_fprintf(FILE *f, char **l) {
5682 assert(f);
5683
5684 STRV_FOREACH(g, l)
5685 fprintf(f, " %s", *g);
5686 }
5687
/* Writes "<prefix><name>:" followed by the space-separated strings of 'strv' and a newline to 'f'.
 * Nothing is written if 'strv' is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5699
5700 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5701 int r;
5702
5703 assert(c);
5704 assert(f);
5705
5706 prefix = strempty(prefix);
5707
5708 fprintf(f,
5709 "%sUMask: %04o\n"
5710 "%sWorkingDirectory: %s\n"
5711 "%sRootDirectory: %s\n"
5712 "%sRootEphemeral: %s\n"
5713 "%sNonBlocking: %s\n"
5714 "%sPrivateTmp: %s\n"
5715 "%sPrivateDevices: %s\n"
5716 "%sProtectKernelTunables: %s\n"
5717 "%sProtectKernelModules: %s\n"
5718 "%sProtectKernelLogs: %s\n"
5719 "%sProtectClock: %s\n"
5720 "%sProtectControlGroups: %s\n"
5721 "%sPrivateNetwork: %s\n"
5722 "%sPrivateUsers: %s\n"
5723 "%sProtectHome: %s\n"
5724 "%sProtectSystem: %s\n"
5725 "%sMountAPIVFS: %s\n"
5726 "%sIgnoreSIGPIPE: %s\n"
5727 "%sMemoryDenyWriteExecute: %s\n"
5728 "%sRestrictRealtime: %s\n"
5729 "%sRestrictSUIDSGID: %s\n"
5730 "%sKeyringMode: %s\n"
5731 "%sProtectHostname: %s\n"
5732 "%sProtectProc: %s\n"
5733 "%sProcSubset: %s\n",
5734 prefix, c->umask,
5735 prefix, empty_to_root(c->working_directory),
5736 prefix, empty_to_root(c->root_directory),
5737 prefix, yes_no(c->root_ephemeral),
5738 prefix, yes_no(c->non_blocking),
5739 prefix, yes_no(c->private_tmp),
5740 prefix, yes_no(c->private_devices),
5741 prefix, yes_no(c->protect_kernel_tunables),
5742 prefix, yes_no(c->protect_kernel_modules),
5743 prefix, yes_no(c->protect_kernel_logs),
5744 prefix, yes_no(c->protect_clock),
5745 prefix, yes_no(c->protect_control_groups),
5746 prefix, yes_no(c->private_network),
5747 prefix, yes_no(c->private_users),
5748 prefix, protect_home_to_string(c->protect_home),
5749 prefix, protect_system_to_string(c->protect_system),
5750 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5751 prefix, yes_no(c->ignore_sigpipe),
5752 prefix, yes_no(c->memory_deny_write_execute),
5753 prefix, yes_no(c->restrict_realtime),
5754 prefix, yes_no(c->restrict_suid_sgid),
5755 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5756 prefix, yes_no(c->protect_hostname),
5757 prefix, protect_proc_to_string(c->protect_proc),
5758 prefix, proc_subset_to_string(c->proc_subset));
5759
5760 if (c->root_image)
5761 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5762
5763 if (c->root_image_options) {
5764 fprintf(f, "%sRootImageOptions:", prefix);
5765 LIST_FOREACH(mount_options, o, c->root_image_options)
5766 if (!isempty(o->options))
5767 fprintf(f, " %s:%s",
5768 partition_designator_to_string(o->partition_designator),
5769 o->options);
5770 fprintf(f, "\n");
5771 }
5772
5773 if (c->root_hash) {
5774 _cleanup_free_ char *encoded = NULL;
5775 encoded = hexmem(c->root_hash, c->root_hash_size);
5776 if (encoded)
5777 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5778 }
5779
5780 if (c->root_hash_path)
5781 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5782
5783 if (c->root_hash_sig) {
5784 _cleanup_free_ char *encoded = NULL;
5785 ssize_t len;
5786 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5787 if (len)
5788 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5789 }
5790
5791 if (c->root_hash_sig_path)
5792 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5793
5794 if (c->root_verity)
5795 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5796
5797 STRV_FOREACH(e, c->environment)
5798 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5799
5800 STRV_FOREACH(e, c->environment_files)
5801 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5802
5803 STRV_FOREACH(e, c->pass_environment)
5804 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5805
5806 STRV_FOREACH(e, c->unset_environment)
5807 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5808
5809 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5810
5811 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5812 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5813
5814 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5815 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5816
5817 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5818 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5819 }
5820 }
5821
5822 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
5823
5824 if (c->nice_set)
5825 fprintf(f, "%sNice: %i\n", prefix, c->nice);
5826
5827 if (c->oom_score_adjust_set)
5828 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
5829
5830 if (c->coredump_filter_set)
5831 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
5832
5833 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5834 if (c->rlimit[i]) {
5835 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5836 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5837 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5838 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5839 }
5840
5841 if (c->ioprio_set) {
5842 _cleanup_free_ char *class_str = NULL;
5843
5844 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
5845 if (r >= 0)
5846 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5847
5848 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
5849 }
5850
5851 if (c->cpu_sched_set) {
5852 _cleanup_free_ char *policy_str = NULL;
5853
5854 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5855 if (r >= 0)
5856 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5857
5858 fprintf(f,
5859 "%sCPUSchedulingPriority: %i\n"
5860 "%sCPUSchedulingResetOnFork: %s\n",
5861 prefix, c->cpu_sched_priority,
5862 prefix, yes_no(c->cpu_sched_reset_on_fork));
5863 }
5864
5865 if (c->cpu_set.set) {
5866 _cleanup_free_ char *affinity = NULL;
5867
5868 affinity = cpu_set_to_range_string(&c->cpu_set);
5869 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5870 }
5871
5872 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5873 _cleanup_free_ char *nodes = NULL;
5874
5875 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5876 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5877 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5878 }
5879
5880 if (c->timer_slack_nsec != NSEC_INFINITY)
5881 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5882
5883 fprintf(f,
5884 "%sStandardInput: %s\n"
5885 "%sStandardOutput: %s\n"
5886 "%sStandardError: %s\n",
5887 prefix, exec_input_to_string(c->std_input),
5888 prefix, exec_output_to_string(c->std_output),
5889 prefix, exec_output_to_string(c->std_error));
5890
5891 if (c->std_input == EXEC_INPUT_NAMED_FD)
5892 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5893 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5894 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5895 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5896 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5897
5898 if (c->std_input == EXEC_INPUT_FILE)
5899 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5900 if (c->std_output == EXEC_OUTPUT_FILE)
5901 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5902 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5903 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5904 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5905 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5906 if (c->std_error == EXEC_OUTPUT_FILE)
5907 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5908 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5909 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5910 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5911 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5912
5913 if (c->tty_path)
5914 fprintf(f,
5915 "%sTTYPath: %s\n"
5916 "%sTTYReset: %s\n"
5917 "%sTTYVHangup: %s\n"
5918 "%sTTYVTDisallocate: %s\n"
5919 "%sTTYRows: %u\n"
5920 "%sTTYColumns: %u\n",
5921 prefix, c->tty_path,
5922 prefix, yes_no(c->tty_reset),
5923 prefix, yes_no(c->tty_vhangup),
5924 prefix, yes_no(c->tty_vt_disallocate),
5925 prefix, c->tty_rows,
5926 prefix, c->tty_cols);
5927
5928 if (IN_SET(c->std_output,
5929 EXEC_OUTPUT_KMSG,
5930 EXEC_OUTPUT_JOURNAL,
5931 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5932 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5933 IN_SET(c->std_error,
5934 EXEC_OUTPUT_KMSG,
5935 EXEC_OUTPUT_JOURNAL,
5936 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5937 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5938
5939 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5940
5941 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5942 if (r >= 0)
5943 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5944
5945 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5946 if (r >= 0)
5947 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5948 }
5949
5950 if (c->log_level_max >= 0) {
5951 _cleanup_free_ char *t = NULL;
5952
5953 (void) log_level_to_string_alloc(c->log_level_max, &t);
5954
5955 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5956 }
5957
5958 if (c->log_ratelimit_interval_usec > 0)
5959 fprintf(f,
5960 "%sLogRateLimitIntervalSec: %s\n",
5961 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
5962
5963 if (c->log_ratelimit_burst > 0)
5964 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
5965
5966 if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
5967 fprintf(f, "%sLogFilterPatterns:", prefix);
5968
5969 char *pattern;
5970 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
5971 fprintf(f, " %s", pattern);
5972 SET_FOREACH(pattern, c->log_filter_denied_patterns)
5973 fprintf(f, " ~%s", pattern);
5974 fputc('\n', f);
5975 }
5976
5977 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5978 fprintf(f, "%sLogExtraFields: ", prefix);
5979 fwrite(c->log_extra_fields[j].iov_base,
5980 1, c->log_extra_fields[j].iov_len,
5981 f);
5982 fputc('\n', f);
5983 }
5984
5985 if (c->log_namespace)
5986 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5987
5988 if (c->secure_bits) {
5989 _cleanup_free_ char *str = NULL;
5990
5991 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5992 if (r >= 0)
5993 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5994 }
5995
5996 if (c->capability_bounding_set != CAP_MASK_UNSET) {
5997 _cleanup_free_ char *str = NULL;
5998
5999 r = capability_set_to_string(c->capability_bounding_set, &str);
6000 if (r >= 0)
6001 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6002 }
6003
6004 if (c->capability_ambient_set != 0) {
6005 _cleanup_free_ char *str = NULL;
6006
6007 r = capability_set_to_string(c->capability_ambient_set, &str);
6008 if (r >= 0)
6009 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6010 }
6011
6012 if (c->user)
6013 fprintf(f, "%sUser: %s\n", prefix, c->user);
6014 if (c->group)
6015 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6016
6017 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6018
6019 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6020
6021 if (c->pam_name)
6022 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6023
6024 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6025 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6026 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6027 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6028 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6029 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6030
6031 for (size_t i = 0; i < c->n_bind_mounts; i++)
6032 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6033 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6034 c->bind_mounts[i].ignore_enoent ? "-": "",
6035 c->bind_mounts[i].source,
6036 c->bind_mounts[i].destination,
6037 c->bind_mounts[i].recursive ? "rbind" : "norbind");
6038
6039 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6040 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6041
6042 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6043 t->path,
6044 isempty(t->options) ? "" : ":",
6045 strempty(t->options));
6046 }
6047
6048 if (c->utmp_id)
6049 fprintf(f,
6050 "%sUtmpIdentifier: %s\n",
6051 prefix, c->utmp_id);
6052
6053 if (c->selinux_context)
6054 fprintf(f,
6055 "%sSELinuxContext: %s%s\n",
6056 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6057
6058 if (c->apparmor_profile)
6059 fprintf(f,
6060 "%sAppArmorProfile: %s%s\n",
6061 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6062
6063 if (c->smack_process_label)
6064 fprintf(f,
6065 "%sSmackProcessLabel: %s%s\n",
6066 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6067
6068 if (c->personality != PERSONALITY_INVALID)
6069 fprintf(f,
6070 "%sPersonality: %s\n",
6071 prefix, strna(personality_to_string(c->personality)));
6072
6073 fprintf(f,
6074 "%sLockPersonality: %s\n",
6075 prefix, yes_no(c->lock_personality));
6076
6077 if (c->syscall_filter) {
6078 fprintf(f,
6079 "%sSystemCallFilter: ",
6080 prefix);
6081
6082 if (!c->syscall_allow_list)
6083 fputc('~', f);
6084
6085 #if HAVE_SECCOMP
6086 void *id, *val;
6087 bool first = true;
6088 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6089 _cleanup_free_ char *name = NULL;
6090 const char *errno_name = NULL;
6091 int num = PTR_TO_INT(val);
6092
6093 if (first)
6094 first = false;
6095 else
6096 fputc(' ', f);
6097
6098 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6099 fputs(strna(name), f);
6100
6101 if (num >= 0) {
6102 errno_name = seccomp_errno_or_action_to_string(num);
6103 if (errno_name)
6104 fprintf(f, ":%s", errno_name);
6105 else
6106 fprintf(f, ":%d", num);
6107 }
6108 }
6109 #endif
6110
6111 fputc('\n', f);
6112 }
6113
6114 if (c->syscall_archs) {
6115 fprintf(f,
6116 "%sSystemCallArchitectures:",
6117 prefix);
6118
6119 #if HAVE_SECCOMP
6120 void *id;
6121 SET_FOREACH(id, c->syscall_archs)
6122 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6123 #endif
6124 fputc('\n', f);
6125 }
6126
6127 if (exec_context_restrict_namespaces_set(c)) {
6128 _cleanup_free_ char *s = NULL;
6129
6130 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6131 if (r >= 0)
6132 fprintf(f, "%sRestrictNamespaces: %s\n",
6133 prefix, strna(s));
6134 }
6135
6136 #if HAVE_LIBBPF
6137 if (exec_context_restrict_filesystems_set(c)) {
6138 char *fs;
6139 SET_FOREACH(fs, c->restrict_filesystems)
6140 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6141 }
6142 #endif
6143
6144 if (c->network_namespace_path)
6145 fprintf(f,
6146 "%sNetworkNamespacePath: %s\n",
6147 prefix, c->network_namespace_path);
6148
6149 if (c->syscall_errno > 0) {
6150 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6151
6152 #if HAVE_SECCOMP
6153 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6154 if (errno_name)
6155 fputs(errno_name, f);
6156 else
6157 fprintf(f, "%d", c->syscall_errno);
6158 #endif
6159 fputc('\n', f);
6160 }
6161
6162 for (size_t i = 0; i < c->n_mount_images; i++) {
6163 fprintf(f, "%sMountImages: %s%s:%s", prefix,
6164 c->mount_images[i].ignore_enoent ? "-": "",
6165 c->mount_images[i].source,
6166 c->mount_images[i].destination);
6167 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
6168 fprintf(f, ":%s:%s",
6169 partition_designator_to_string(o->partition_designator),
6170 strempty(o->options));
6171 fprintf(f, "\n");
6172 }
6173
6174 for (size_t i = 0; i < c->n_extension_images; i++) {
6175 fprintf(f, "%sExtensionImages: %s%s", prefix,
6176 c->extension_images[i].ignore_enoent ? "-": "",
6177 c->extension_images[i].source);
6178 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6179 fprintf(f, ":%s:%s",
6180 partition_designator_to_string(o->partition_designator),
6181 strempty(o->options));
6182 fprintf(f, "\n");
6183 }
6184
6185 strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
6186 }
6187
6188 bool exec_context_maintains_privileges(const ExecContext *c) {
6189 assert(c);
6190
6191 /* Returns true if the process forked off would run under
6192 * an unchanged UID or as root. */
6193
6194 if (!c->user)
6195 return true;
6196
6197 if (streq(c->user, "root") || streq(c->user, "0"))
6198 return true;
6199
6200 return false;
6201 }
6202
6203 int exec_context_get_effective_ioprio(const ExecContext *c) {
6204 int p;
6205
6206 assert(c);
6207
6208 if (c->ioprio_set)
6209 return c->ioprio;
6210
6211 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6212 if (p < 0)
6213 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
6214
6215 return ioprio_normalize(p);
6216 }
6217
6218 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6219 assert(c);
6220
6221 /* Explicit setting wins */
6222 if (c->mount_apivfs_set)
6223 return c->mount_apivfs;
6224
6225 /* Default to "yes" if root directory or image are specified */
6226 if (exec_context_with_rootfs(c))
6227 return true;
6228
6229 return false;
6230 }
6231
6232 void exec_context_free_log_extra_fields(ExecContext *c) {
6233 assert(c);
6234
6235 for (size_t l = 0; l < c->n_log_extra_fields; l++)
6236 free(c->log_extra_fields[l].iov_base);
6237 c->log_extra_fields = mfree(c->log_extra_fields);
6238 c->n_log_extra_fields = 0;
6239 }
6240
6241 void exec_context_revert_tty(ExecContext *c) {
6242 _cleanup_close_ int fd = -EBADF;
6243 const char *path;
6244 struct stat st;
6245 int r;
6246
6247 assert(c);
6248
6249 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6250 exec_context_tty_reset(c, NULL);
6251
6252 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6253 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6254 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6255 if (!exec_context_may_touch_tty(c))
6256 return;
6257
6258 path = exec_context_tty_path(c);
6259 if (!path)
6260 return;
6261
6262 fd = open(path, O_PATH|O_CLOEXEC);
6263 if (fd < 0)
6264 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6265 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6266 path);
6267
6268 if (fstat(fd, &st) < 0)
6269 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6270
6271 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6272 * if things are a character device, since a proper check either means we'd have to open the TTY and
6273 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6274 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6275 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6276 if (!S_ISCHR(st.st_mode))
6277 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6278
6279 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6280 if (r < 0)
6281 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6282 }
6283
6284 int exec_context_get_clean_directories(
6285 ExecContext *c,
6286 char **prefix,
6287 ExecCleanMask mask,
6288 char ***ret) {
6289
6290 _cleanup_strv_free_ char **l = NULL;
6291 int r;
6292
6293 assert(c);
6294 assert(prefix);
6295 assert(ret);
6296
6297 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6298 if (!FLAGS_SET(mask, 1U << t))
6299 continue;
6300
6301 if (!prefix[t])
6302 continue;
6303
6304 for (size_t i = 0; i < c->directories[t].n_items; i++) {
6305 char *j;
6306
6307 j = path_join(prefix[t], c->directories[t].items[i].path);
6308 if (!j)
6309 return -ENOMEM;
6310
6311 r = strv_consume(&l, j);
6312 if (r < 0)
6313 return r;
6314
6315 /* Also remove private directories unconditionally. */
6316 if (t != EXEC_DIRECTORY_CONFIGURATION) {
6317 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6318 if (!j)
6319 return -ENOMEM;
6320
6321 r = strv_consume(&l, j);
6322 if (r < 0)
6323 return r;
6324 }
6325
6326 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6327 j = path_join(prefix[t], *symlink);
6328 if (!j)
6329 return -ENOMEM;
6330
6331 r = strv_consume(&l, j);
6332 if (r < 0)
6333 return r;
6334 }
6335 }
6336 }
6337
6338 *ret = TAKE_PTR(l);
6339 return 0;
6340 }
6341
6342 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6343 ExecCleanMask mask = 0;
6344
6345 assert(c);
6346 assert(ret);
6347
6348 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6349 if (c->directories[t].n_items > 0)
6350 mask |= 1U << t;
6351
6352 *ret = mask;
6353 return 0;
6354 }
6355
6356 void exec_status_start(ExecStatus *s, pid_t pid) {
6357 assert(s);
6358
6359 *s = (ExecStatus) {
6360 .pid = pid,
6361 };
6362
6363 dual_timestamp_get(&s->start_timestamp);
6364 }
6365
6366 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
6367 assert(s);
6368
6369 if (s->pid != pid)
6370 *s = (ExecStatus) {
6371 .pid = pid,
6372 };
6373
6374 dual_timestamp_get(&s->exit_timestamp);
6375
6376 s->code = code;
6377 s->status = status;
6378
6379 if (context && context->utmp_id)
6380 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
6381 }
6382
6383 void exec_status_reset(ExecStatus *s) {
6384 assert(s);
6385
6386 *s = (ExecStatus) {};
6387 }
6388
6389 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
6390 assert(s);
6391 assert(f);
6392
6393 if (s->pid <= 0)
6394 return;
6395
6396 prefix = strempty(prefix);
6397
6398 fprintf(f,
6399 "%sPID: "PID_FMT"\n",
6400 prefix, s->pid);
6401
6402 if (dual_timestamp_is_set(&s->start_timestamp))
6403 fprintf(f,
6404 "%sStart Timestamp: %s\n",
6405 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
6406
6407 if (dual_timestamp_is_set(&s->exit_timestamp))
6408 fprintf(f,
6409 "%sExit Timestamp: %s\n"
6410 "%sExit Code: %s\n"
6411 "%sExit Status: %i\n",
6412 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
6413 prefix, sigchld_code_to_string(s->code),
6414 prefix, s->status);
6415 }
6416
6417 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
6418 _cleanup_free_ char *cmd = NULL;
6419 const char *prefix2;
6420
6421 assert(c);
6422 assert(f);
6423
6424 prefix = strempty(prefix);
6425 prefix2 = strjoina(prefix, "\t");
6426
6427 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
6428
6429 fprintf(f,
6430 "%sCommand Line: %s\n",
6431 prefix, strnull(cmd));
6432
6433 exec_status_dump(&c->exec_status, f, prefix2);
6434 }
6435
6436 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6437 assert(f);
6438
6439 prefix = strempty(prefix);
6440
6441 LIST_FOREACH(command, i, c)
6442 exec_command_dump(i, f, prefix);
6443 }
6444
6445 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6446 ExecCommand *end;
6447
6448 assert(l);
6449 assert(e);
6450
6451 if (*l) {
6452 /* It's kind of important, that we keep the order here */
6453 end = LIST_FIND_TAIL(command, *l);
6454 LIST_INSERT_AFTER(command, *l, end, e);
6455 } else
6456 *l = e;
6457 }
6458
6459 int exec_command_set(ExecCommand *c, const char *path, ...) {
6460 va_list ap;
6461 char **l, *p;
6462
6463 assert(c);
6464 assert(path);
6465
6466 va_start(ap, path);
6467 l = strv_new_ap(path, ap);
6468 va_end(ap);
6469
6470 if (!l)
6471 return -ENOMEM;
6472
6473 p = strdup(path);
6474 if (!p) {
6475 strv_free(l);
6476 return -ENOMEM;
6477 }
6478
6479 free_and_replace(c->path, p);
6480
6481 return strv_free_and_replace(c->argv, l);
6482 }
6483
6484 int exec_command_append(ExecCommand *c, const char *path, ...) {
6485 _cleanup_strv_free_ char **l = NULL;
6486 va_list ap;
6487 int r;
6488
6489 assert(c);
6490 assert(path);
6491
6492 va_start(ap, path);
6493 l = strv_new_ap(path, ap);
6494 va_end(ap);
6495
6496 if (!l)
6497 return -ENOMEM;
6498
6499 r = strv_extend_strv(&c->argv, l, false);
6500 if (r < 0)
6501 return r;
6502
6503 return 0;
6504 }
6505
6506 static char *destroy_tree(char *path) {
6507 if (!path)
6508 return NULL;
6509
6510 if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
6511 log_debug("Spawning process to nuke '%s'", path);
6512
6513 (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
6514 }
6515
6516 return mfree(path);
6517 }
6518
6519 static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
6520 if (!rt)
6521 return NULL;
6522
6523 if (rt->manager)
6524 (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
6525
6526 rt->id = mfree(rt->id);
6527 rt->tmp_dir = mfree(rt->tmp_dir);
6528 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6529 safe_close_pair(rt->netns_storage_socket);
6530 safe_close_pair(rt->ipcns_storage_socket);
6531 return mfree(rt);
6532 }
6533
6534 DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
6535 DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
6536
6537 ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
6538 if (!rt)
6539 return NULL;
6540
6541 assert(rt->n_ref > 0);
6542 rt->n_ref--;
6543
6544 if (rt->n_ref > 0)
6545 return NULL;
6546
6547 rt->tmp_dir = destroy_tree(rt->tmp_dir);
6548 rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
6549
6550 return exec_shared_runtime_free(rt);
6551 }
6552
6553 static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
6554 _cleanup_free_ char *id_copy = NULL;
6555 ExecSharedRuntime *n;
6556
6557 assert(ret);
6558
6559 id_copy = strdup(id);
6560 if (!id_copy)
6561 return -ENOMEM;
6562
6563 n = new(ExecSharedRuntime, 1);
6564 if (!n)
6565 return -ENOMEM;
6566
6567 *n = (ExecSharedRuntime) {
6568 .id = TAKE_PTR(id_copy),
6569 .netns_storage_socket = PIPE_EBADF,
6570 .ipcns_storage_socket = PIPE_EBADF,
6571 };
6572
6573 *ret = n;
6574 return 0;
6575 }
6576
6577 static int exec_shared_runtime_add(
6578 Manager *m,
6579 const char *id,
6580 char **tmp_dir,
6581 char **var_tmp_dir,
6582 int netns_storage_socket[2],
6583 int ipcns_storage_socket[2],
6584 ExecSharedRuntime **ret) {
6585
6586 _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
6587 int r;
6588
6589 assert(m);
6590 assert(id);
6591
6592 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6593
6594 r = exec_shared_runtime_allocate(&rt, id);
6595 if (r < 0)
6596 return r;
6597
6598 r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
6599 if (r < 0)
6600 return r;
6601
6602 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6603 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6604 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6605
6606 if (netns_storage_socket) {
6607 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6608 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6609 }
6610
6611 if (ipcns_storage_socket) {
6612 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6613 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6614 }
6615
6616 rt->manager = m;
6617
6618 if (ret)
6619 *ret = rt;
6620 /* do not remove created ExecSharedRuntime object when the operation succeeds. */
6621 TAKE_PTR(rt);
6622 return 0;
6623 }
6624
6625 static int exec_shared_runtime_make(
6626 Manager *m,
6627 const ExecContext *c,
6628 const char *id,
6629 ExecSharedRuntime **ret) {
6630
6631 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6632 _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
6633 int r;
6634
6635 assert(m);
6636 assert(c);
6637 assert(id);
6638
6639 /* It is not necessary to create ExecSharedRuntime object. */
6640 if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
6641 *ret = NULL;
6642 return 0;
6643 }
6644
6645 if (c->private_tmp &&
6646 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6647 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6648 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6649 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6650 if (r < 0)
6651 return r;
6652 }
6653
6654 if (exec_needs_network_namespace(c)) {
6655 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6656 return -errno;
6657 }
6658
6659 if (exec_needs_ipc_namespace(c)) {
6660 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6661 return -errno;
6662 }
6663
6664 r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
6665 if (r < 0)
6666 return r;
6667
6668 return 1;
6669 }
6670
6671 int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
6672 ExecSharedRuntime *rt;
6673 int r;
6674
6675 assert(m);
6676 assert(id);
6677 assert(ret);
6678
6679 rt = hashmap_get(m->exec_shared_runtime_by_id, id);
6680 if (rt)
6681 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
6682 goto ref;
6683
6684 if (!create) {
6685 *ret = NULL;
6686 return 0;
6687 }
6688
6689 /* If not found, then create a new object. */
6690 r = exec_shared_runtime_make(m, c, id, &rt);
6691 if (r < 0)
6692 return r;
6693 if (r == 0) {
6694 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
6695 *ret = NULL;
6696 return 0;
6697 }
6698
6699 ref:
6700 /* increment reference counter. */
6701 rt->n_ref++;
6702 *ret = rt;
6703 return 1;
6704 }
6705
6706 int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6707 ExecSharedRuntime *rt;
6708
6709 assert(m);
6710 assert(f);
6711 assert(fds);
6712
6713 HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
6714 fprintf(f, "exec-runtime=%s", rt->id);
6715
6716 if (rt->tmp_dir)
6717 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6718
6719 if (rt->var_tmp_dir)
6720 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6721
6722 if (rt->netns_storage_socket[0] >= 0) {
6723 int copy;
6724
6725 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6726 if (copy < 0)
6727 return copy;
6728
6729 fprintf(f, " netns-socket-0=%i", copy);
6730 }
6731
6732 if (rt->netns_storage_socket[1] >= 0) {
6733 int copy;
6734
6735 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6736 if (copy < 0)
6737 return copy;
6738
6739 fprintf(f, " netns-socket-1=%i", copy);
6740 }
6741
6742 if (rt->ipcns_storage_socket[0] >= 0) {
6743 int copy;
6744
6745 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6746 if (copy < 0)
6747 return copy;
6748
6749 fprintf(f, " ipcns-socket-0=%i", copy);
6750 }
6751
6752 if (rt->ipcns_storage_socket[1] >= 0) {
6753 int copy;
6754
6755 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6756 if (copy < 0)
6757 return copy;
6758
6759 fprintf(f, " ipcns-socket-1=%i", copy);
6760 }
6761
6762 fputc('\n', f);
6763 }
6764
6765 return 0;
6766 }
6767
6768 int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6769 _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
6770 ExecSharedRuntime *rt;
6771 int r;
6772
6773 /* This is for the migration from old (v237 or earlier) deserialization text.
6774 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6775 * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
6776 * so or not from the serialized text, then we always creates a new object owned by this. */
6777
6778 assert(u);
6779 assert(key);
6780 assert(value);
6781
6782 /* Manager manages ExecSharedRuntime objects by the unit id.
6783 * So, we omit the serialized text when the unit does not have id (yet?)... */
6784 if (isempty(u->id)) {
6785 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6786 return 0;
6787 }
6788
6789 if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
6790 return log_oom();
6791
6792 rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
6793 if (!rt) {
6794 if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
6795 return log_oom();
6796
6797 rt = rt_create;
6798 }
6799
6800 if (streq(key, "tmp-dir")) {
6801 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6802 return -ENOMEM;
6803
6804 } else if (streq(key, "var-tmp-dir")) {
6805 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6806 return -ENOMEM;
6807
6808 } else if (streq(key, "netns-socket-0")) {
6809 int fd;
6810
6811 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
6812 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6813 return 0;
6814 }
6815
6816 safe_close(rt->netns_storage_socket[0]);
6817 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6818
6819 } else if (streq(key, "netns-socket-1")) {
6820 int fd;
6821
6822 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
6823 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6824 return 0;
6825 }
6826
6827 safe_close(rt->netns_storage_socket[1]);
6828 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
6829
6830 } else
6831 return 0;
6832
6833 /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
6834 if (rt_create) {
6835 r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
6836 if (r < 0) {
6837 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
6838 return 0;
6839 }
6840
6841 rt_create->manager = u->manager;
6842
6843 /* Avoid cleanup */
6844 TAKE_PTR(rt_create);
6845 }
6846
6847 return 1;
6848 }
6849
6850 int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6851 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6852 char *id = NULL;
6853 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
6854 const char *p, *v = ASSERT_PTR(value);
6855 size_t n;
6856
6857 assert(m);
6858 assert(fds);
6859
6860 n = strcspn(v, " ");
6861 id = strndupa_safe(v, n);
6862 if (v[n] != ' ')
6863 goto finalize;
6864 p = v + n + 1;
6865
6866 v = startswith(p, "tmp-dir=");
6867 if (v) {
6868 n = strcspn(v, " ");
6869 tmp_dir = strndup(v, n);
6870 if (!tmp_dir)
6871 return log_oom();
6872 if (v[n] != ' ')
6873 goto finalize;
6874 p = v + n + 1;
6875 }
6876
6877 v = startswith(p, "var-tmp-dir=");
6878 if (v) {
6879 n = strcspn(v, " ");
6880 var_tmp_dir = strndup(v, n);
6881 if (!var_tmp_dir)
6882 return log_oom();
6883 if (v[n] != ' ')
6884 goto finalize;
6885 p = v + n + 1;
6886 }
6887
6888 v = startswith(p, "netns-socket-0=");
6889 if (v) {
6890 char *buf;
6891
6892 n = strcspn(v, " ");
6893 buf = strndupa_safe(v, n);
6894
6895 netns_fdpair[0] = parse_fd(buf);
6896 if (netns_fdpair[0] < 0)
6897 return log_debug_errno(netns_fdpair[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
6898 if (!fdset_contains(fds, netns_fdpair[0]))
6899 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6900 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6901 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
6902 if (v[n] != ' ')
6903 goto finalize;
6904 p = v + n + 1;
6905 }
6906
6907 v = startswith(p, "netns-socket-1=");
6908 if (v) {
6909 char *buf;
6910
6911 n = strcspn(v, " ");
6912 buf = strndupa_safe(v, n);
6913
6914 netns_fdpair[1] = parse_fd(buf);
6915 if (netns_fdpair[1] < 0)
6916 return log_debug_errno(netns_fdpair[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
6917 if (!fdset_contains(fds, netns_fdpair[1]))
6918 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6919 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6920 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6921 if (v[n] != ' ')
6922 goto finalize;
6923 p = v + n + 1;
6924 }
6925
6926 v = startswith(p, "ipcns-socket-0=");
6927 if (v) {
6928 char *buf;
6929
6930 n = strcspn(v, " ");
6931 buf = strndupa_safe(v, n);
6932
6933 ipcns_fdpair[0] = parse_fd(buf);
6934 if (ipcns_fdpair[0] < 0)
6935 return log_debug_errno(ipcns_fdpair[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6936 if (!fdset_contains(fds, ipcns_fdpair[0]))
6937 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6938 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6939 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6940 if (v[n] != ' ')
6941 goto finalize;
6942 p = v + n + 1;
6943 }
6944
6945 v = startswith(p, "ipcns-socket-1=");
6946 if (v) {
6947 char *buf;
6948
6949 n = strcspn(v, " ");
6950 buf = strndupa_safe(v, n);
6951
6952 ipcns_fdpair[1] = parse_fd(buf);
6953 if (ipcns_fdpair[1] < 0)
6954 return log_debug_errno(ipcns_fdpair[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6955 if (!fdset_contains(fds, ipcns_fdpair[1]))
6956 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6957 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6958 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
6959 }
6960
6961 finalize:
6962 r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
6963 if (r < 0)
6964 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6965 return 0;
6966 }
6967
6968 void exec_shared_runtime_vacuum(Manager *m) {
6969 ExecSharedRuntime *rt;
6970
6971 assert(m);
6972
6973 /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
6974
6975 HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
6976 if (rt->n_ref > 0)
6977 continue;
6978
6979 (void) exec_shared_runtime_free(rt);
6980 }
6981 }
6982
6983 int exec_runtime_make(
6984 const Unit *unit,
6985 const ExecContext *context,
6986 ExecSharedRuntime *shared,
6987 DynamicCreds *creds,
6988 ExecRuntime **ret) {
6989 _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
6990 _cleanup_free_ char *ephemeral = NULL;
6991 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6992 int r;
6993
6994 assert(unit);
6995 assert(context);
6996 assert(ret);
6997
6998 if (!shared && !creds && !exec_needs_ephemeral(context)) {
6999 *ret = NULL;
7000 return 0;
7001 }
7002
7003 if (exec_needs_ephemeral(context)) {
7004 r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
7005 if (r < 0)
7006 return r;
7007
7008 r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
7009 if (r < 0)
7010 return r;
7011
7012 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
7013 return -errno;
7014 }
7015
7016 rt = new(ExecRuntime, 1);
7017 if (!rt)
7018 return -ENOMEM;
7019
7020 *rt = (ExecRuntime) {
7021 .shared = shared,
7022 .dynamic_creds = creds,
7023 .ephemeral_copy = TAKE_PTR(ephemeral),
7024 .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
7025 .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
7026 };
7027
7028 *ret = TAKE_PTR(rt);
7029 return 1;
7030 }
7031
7032 ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
7033 if (!rt)
7034 return NULL;
7035
7036 exec_shared_runtime_unref(rt->shared);
7037 dynamic_creds_unref(rt->dynamic_creds);
7038
7039 rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
7040
7041 safe_close_pair(rt->ephemeral_storage_socket);
7042 return mfree(rt);
7043 }
7044
7045 ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
7046 if (!rt)
7047 return NULL;
7048
7049 rt->shared = exec_shared_runtime_destroy(rt->shared);
7050 rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
7051 return exec_runtime_free(rt);
7052 }
7053
7054 void exec_params_clear(ExecParameters *p) {
7055 if (!p)
7056 return;
7057
7058 p->environment = strv_free(p->environment);
7059 p->fd_names = strv_free(p->fd_names);
7060 p->fds = mfree(p->fds);
7061 p->exec_fd = safe_close(p->exec_fd);
7062 }
7063
7064 void exec_directory_done(ExecDirectory *d) {
7065 if (!d)
7066 return;
7067
7068 for (size_t i = 0; i < d->n_items; i++) {
7069 free(d->items[i].path);
7070 strv_free(d->items[i].symlinks);
7071 }
7072
7073 d->items = mfree(d->items);
7074 d->n_items = 0;
7075 d->mode = 0755;
7076 }
7077
7078 static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7079 assert(d);
7080 assert(path);
7081
7082 for (size_t i = 0; i < d->n_items; i++)
7083 if (path_equal(d->items[i].path, path))
7084 return &d->items[i];
7085
7086 return NULL;
7087 }
7088
7089 int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
7090 _cleanup_strv_free_ char **s = NULL;
7091 _cleanup_free_ char *p = NULL;
7092 ExecDirectoryItem *existing;
7093 int r;
7094
7095 assert(d);
7096 assert(path);
7097
7098 existing = exec_directory_find(d, path);
7099 if (existing) {
7100 r = strv_extend(&existing->symlinks, symlink);
7101 if (r < 0)
7102 return r;
7103
7104 return 0; /* existing item is updated */
7105 }
7106
7107 p = strdup(path);
7108 if (!p)
7109 return -ENOMEM;
7110
7111 if (symlink) {
7112 s = strv_new(symlink);
7113 if (!s)
7114 return -ENOMEM;
7115 }
7116
7117 if (!GREEDY_REALLOC(d->items, d->n_items + 1))
7118 return -ENOMEM;
7119
7120 d->items[d->n_items++] = (ExecDirectoryItem) {
7121 .path = TAKE_PTR(p),
7122 .symlinks = TAKE_PTR(s),
7123 };
7124
7125 return 1; /* new item is added */
7126 }
7127
7128 static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7129 assert(a);
7130 assert(b);
7131
7132 return path_compare(a->path, b->path);
7133 }
7134
7135 void exec_directory_sort(ExecDirectory *d) {
7136 assert(d);
7137
7138 /* Sort the exec directories to make always parent directories processed at first in
7139 * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7140 * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7141 * list. See also comments in setup_exec_directory() and issue #24783. */
7142
7143 if (d->n_items <= 1)
7144 return;
7145
7146 typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
7147
7148 for (size_t i = 1; i < d->n_items; i++)
7149 for (size_t j = 0; j < i; j++)
7150 if (path_startswith(d->items[i].path, d->items[j].path)) {
7151 d->items[i].only_create = true;
7152 break;
7153 }
7154 }
7155
7156 ExecCleanMask exec_clean_mask_from_string(const char *s) {
7157 ExecDirectoryType t;
7158
7159 assert(s);
7160
7161 if (streq(s, "all"))
7162 return EXEC_CLEAN_ALL;
7163 if (streq(s, "fdstore"))
7164 return EXEC_CLEAN_FDSTORE;
7165
7166 t = exec_resource_type_from_string(s);
7167 if (t < 0)
7168 return (ExecCleanMask) t;
7169
7170 return 1U << t;
7171 }
7172
7173 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
7174 [EXEC_INPUT_NULL] = "null",
7175 [EXEC_INPUT_TTY] = "tty",
7176 [EXEC_INPUT_TTY_FORCE] = "tty-force",
7177 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
7178 [EXEC_INPUT_SOCKET] = "socket",
7179 [EXEC_INPUT_NAMED_FD] = "fd",
7180 [EXEC_INPUT_DATA] = "data",
7181 [EXEC_INPUT_FILE] = "file",
7182 };
7183
7184 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7185
7186 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
7187 [EXEC_OUTPUT_INHERIT] = "inherit",
7188 [EXEC_OUTPUT_NULL] = "null",
7189 [EXEC_OUTPUT_TTY] = "tty",
7190 [EXEC_OUTPUT_KMSG] = "kmsg",
7191 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
7192 [EXEC_OUTPUT_JOURNAL] = "journal",
7193 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
7194 [EXEC_OUTPUT_SOCKET] = "socket",
7195 [EXEC_OUTPUT_NAMED_FD] = "fd",
7196 [EXEC_OUTPUT_FILE] = "file",
7197 [EXEC_OUTPUT_FILE_APPEND] = "append",
7198 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
7199 };
7200
7201 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
7202
7203 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
7204 [EXEC_UTMP_INIT] = "init",
7205 [EXEC_UTMP_LOGIN] = "login",
7206 [EXEC_UTMP_USER] = "user",
7207 };
7208
7209 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
7210
7211 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
7212 [EXEC_PRESERVE_NO] = "no",
7213 [EXEC_PRESERVE_YES] = "yes",
7214 [EXEC_PRESERVE_RESTART] = "restart",
7215 };
7216
7217 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
7218
7219 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
7220 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7221 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
7222 [EXEC_DIRECTORY_STATE] = "StateDirectory",
7223 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
7224 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
7225 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
7226 };
7227
7228 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
7229
7230 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7231 static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7232 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
7233 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
7234 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
7235 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
7236 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
7237 };
7238
7239 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7240
7241 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7242 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7243 * directories, specifically .timer units with their timestamp touch file. */
7244 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7245 [EXEC_DIRECTORY_RUNTIME] = "runtime",
7246 [EXEC_DIRECTORY_STATE] = "state",
7247 [EXEC_DIRECTORY_CACHE] = "cache",
7248 [EXEC_DIRECTORY_LOGS] = "logs",
7249 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
7250 };
7251
7252 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7253
7254 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7255 * the service payload in. */
7256 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7257 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
7258 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
7259 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
7260 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
7261 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
7262 };
7263
7264 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7265
7266 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
7267 [EXEC_KEYRING_INHERIT] = "inherit",
7268 [EXEC_KEYRING_PRIVATE] = "private",
7269 [EXEC_KEYRING_SHARED] = "shared",
7270 };
7271
7272 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);