]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
Merge pull request #29537 from poettering/varlink-tweaks
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <poll.h>
6 #include <sys/eventfd.h>
7 #include <sys/file.h>
8 #include <sys/ioctl.h>
9 #include <sys/mman.h>
10 #include <sys/personality.h>
11 #include <sys/prctl.h>
12 #include <sys/shm.h>
13 #include <sys/types.h>
14 #include <sys/un.h>
15 #include <unistd.h>
16 #include <utmpx.h>
17
18 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
19
20 #if HAVE_PAM
21 #include <security/pam_appl.h>
22 #endif
23
24 #if HAVE_SELINUX
25 #include <selinux/selinux.h>
26 #endif
27
28 #if HAVE_APPARMOR
29 #include <sys/apparmor.h>
30 #endif
31
32 #include "sd-messages.h"
33
34 #include "af-list.h"
35 #include "alloc-util.h"
36 #if HAVE_APPARMOR
37 #include "apparmor-util.h"
38 #endif
39 #include "argv-util.h"
40 #include "async.h"
41 #include "barrier.h"
42 #include "bpf-lsm.h"
43 #include "btrfs-util.h"
44 #include "cap-list.h"
45 #include "capability-util.h"
46 #include "chattr-util.h"
47 #include "cgroup-setup.h"
48 #include "chase.h"
49 #include "chown-recursive.h"
50 #include "constants.h"
51 #include "cpu-set-util.h"
52 #include "data-fd-util.h"
53 #include "env-file.h"
54 #include "env-util.h"
55 #include "errno-list.h"
56 #include "escape.h"
57 #include "exec-credential.h"
58 #include "execute.h"
59 #include "exit-status.h"
60 #include "fd-util.h"
61 #include "format-util.h"
62 #include "glob-util.h"
63 #include "hexdecoct.h"
64 #include "io-util.h"
65 #include "ioprio-util.h"
66 #include "lock-util.h"
67 #include "log.h"
68 #include "macro.h"
69 #include "manager.h"
70 #include "manager-dump.h"
71 #include "memory-util.h"
72 #include "missing_fs.h"
73 #include "missing_ioprio.h"
74 #include "missing_prctl.h"
75 #include "mkdir-label.h"
76 #include "namespace.h"
77 #include "parse-util.h"
78 #include "path-util.h"
79 #include "proc-cmdline.h"
80 #include "process-util.h"
81 #include "psi-util.h"
82 #include "rlimit-util.h"
83 #include "rm-rf.h"
84 #include "seccomp-util.h"
85 #include "securebits-util.h"
86 #include "selinux-util.h"
87 #include "signal-util.h"
88 #include "smack-util.h"
89 #include "socket-util.h"
90 #include "sort-util.h"
91 #include "special.h"
92 #include "stat-util.h"
93 #include "string-table.h"
94 #include "string-util.h"
95 #include "strv.h"
96 #include "syslog-util.h"
97 #include "terminal-util.h"
98 #include "tmpfile-util.h"
99 #include "umask-util.h"
100 #include "unit-serialize.h"
101 #include "user-util.h"
102 #include "utmp-wtmp.h"
103
/* Idle timeouts, given as 5x and 1x USEC_PER_SEC. Their users are not part of this excerpt. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* 8 MiB; the size passed to fd_inc_sndbuf() for the journal stream socket in connect_logger_as(). */
#define SNDBUF_SIZE (8*1024*1024)
108
/* Renumbers the n_fds descriptors in fds[] so that entry i ends up being descriptor i + 3. The array
 * is updated in place with the new descriptor numbers, and each old descriptor is closed once it has
 * been duplicated. Returns 0 on success, or a negative errno value if fcntl(F_DUPFD) fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int want = idx + 3, got;

                        /* Nothing to do if this one sits at its target number already */
                        if (fds[idx] == want)
                                continue;

                        got = fcntl(fds[idx], F_DUPFD, want);
                        if (got < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = got;

                        /* F_DUPFD hands out the lowest free number >= want. If that is not 'want'
                         * itself the slot is still occupied, so note the first such index and do
                         * another pass starting there. */
                        if (got != want && retry_at < 0)
                                retry_at = idx;
                }

                first = retry_at;
        } while (first >= 0);

        return 0;
}
148
/* Prepares the descriptors in fds[] for being handed to a child: the first n_socket_fds entries get
 * O_NONBLOCK set or cleared according to 'nonblock', and FD_CLOEXEC is turned off on all n_fds
 * entries. Returns 0 on success, or the negative error of the first helper call that fails. */
static int flags_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        if (n_fds == 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < n_fds; idx++) {
                int fd = fds[idx], r;

                /* O_NONBLOCK is only touched for the socket activation fds at the front of the array */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fd, nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC always goes away, since these fds are meant to be passed on to our
                 * children */
                r = fd_cloexec(fd, false);
                if (r < 0)
                        return r;
        }

        return 0;
}
184
185 static const char *exec_context_tty_path(const ExecContext *context) {
186 assert(context);
187
188 if (context->stdio_as_fds)
189 return NULL;
190
191 if (context->tty_path)
192 return context->tty_path;
193
194 return "/dev/console";
195 }
196
197 static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
198 unsigned rows, cols;
199 const char *tty;
200
201 assert(context);
202 assert(ret_rows);
203 assert(ret_cols);
204
205 rows = context->tty_rows;
206 cols = context->tty_cols;
207
208 tty = exec_context_tty_path(context);
209 if (tty)
210 (void) proc_cmdline_tty_size(tty, rows == UINT_MAX ? &rows : NULL, cols == UINT_MAX ? &cols : NULL);
211
212 *ret_rows = rows;
213 *ret_cols = cols;
214
215 return 0;
216 }
217
218 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
219 _cleanup_close_ int fd = -EBADF;
220 const char *path = exec_context_tty_path(ASSERT_PTR(context));
221
222 /* Take a lock around the device for the duration of the setup that we do here.
223 * systemd-vconsole-setup.service also takes the lock to avoid being interrupted.
224 * We open a new fd that will be closed automatically, and operate on it for convenience.
225 */
226
227 if (p && p->stdin_fd >= 0) {
228 fd = xopenat_lock(p->stdin_fd, NULL,
229 O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, 0, 0, LOCK_BSD, LOCK_EX);
230 if (fd < 0)
231 return;
232 } else if (path) {
233 fd = open_terminal(path, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
234 if (fd < 0)
235 return;
236
237 if (lock_generic(fd, LOCK_BSD, LOCK_EX) < 0)
238 return;
239 } else
240 return; /* nothing to do */
241
242 if (context->tty_vhangup)
243 (void) terminal_vhangup_fd(fd);
244
245 if (context->tty_reset)
246 (void) reset_terminal_fd(fd, true);
247
248 if (p && p->stdin_fd >= 0) {
249 unsigned rows = context->tty_rows, cols = context->tty_cols;
250
251 (void) exec_context_tty_size(context, &rows, &cols);
252 (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
253 }
254
255 if (context->tty_vt_disallocate && path)
256 (void) vt_disallocate(path);
257 }
258
259 static bool is_terminal_input(ExecInput i) {
260 return IN_SET(i,
261 EXEC_INPUT_TTY,
262 EXEC_INPUT_TTY_FORCE,
263 EXEC_INPUT_TTY_FAIL);
264 }
265
266 static bool is_terminal_output(ExecOutput o) {
267 return IN_SET(o,
268 EXEC_OUTPUT_TTY,
269 EXEC_OUTPUT_KMSG_AND_CONSOLE,
270 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
271 }
272
273 static bool is_kmsg_output(ExecOutput o) {
274 return IN_SET(o,
275 EXEC_OUTPUT_KMSG,
276 EXEC_OUTPUT_KMSG_AND_CONSOLE);
277 }
278
279 static bool exec_context_needs_term(const ExecContext *c) {
280 assert(c);
281
282 /* Return true if the execution context suggests we should set $TERM to something useful. */
283
284 if (is_terminal_input(c->std_input))
285 return true;
286
287 if (is_terminal_output(c->std_output))
288 return true;
289
290 if (is_terminal_output(c->std_error))
291 return true;
292
293 return !!c->tty_path;
294 }
295
/* Opens /dev/null with the given flags (plus O_NOCTTY) and hands the result to move_fd() with
 * 'nfd' as the target. Returns -errno if the open() fails, otherwise whatever move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        assert(nfd >= 0);

        int null_fd = open("/dev/null", flags|O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
307
308 static int connect_journal_socket(
309 int fd,
310 const char *log_namespace,
311 uid_t uid,
312 gid_t gid) {
313
314 uid_t olduid = UID_INVALID;
315 gid_t oldgid = GID_INVALID;
316 const char *j;
317 int r;
318
319 j = log_namespace ?
320 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
321 "/run/systemd/journal/stdout";
322
323 if (gid_is_valid(gid)) {
324 oldgid = getgid();
325
326 if (setegid(gid) < 0)
327 return -errno;
328 }
329
330 if (uid_is_valid(uid)) {
331 olduid = getuid();
332
333 if (seteuid(uid) < 0) {
334 r = -errno;
335 goto restore_gid;
336 }
337 }
338
339 r = connect_unix_path(fd, AT_FDCWD, j);
340
341 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
342 an LSM interferes. */
343
344 if (uid_is_valid(uid))
345 (void) seteuid(olduid);
346
347 restore_gid:
348 if (gid_is_valid(gid))
349 (void) setegid(oldgid);
350
351 return r;
352 }
353
354 static int connect_logger_as(
355 const Unit *unit,
356 const ExecContext *context,
357 const ExecParameters *params,
358 ExecOutput output,
359 const char *ident,
360 int nfd,
361 uid_t uid,
362 gid_t gid) {
363
364 _cleanup_close_ int fd = -EBADF;
365 int r;
366
367 assert(context);
368 assert(params);
369 assert(output < _EXEC_OUTPUT_MAX);
370 assert(ident);
371 assert(nfd >= 0);
372
373 fd = socket(AF_UNIX, SOCK_STREAM, 0);
374 if (fd < 0)
375 return -errno;
376
377 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
378 if (r < 0)
379 return r;
380
381 if (shutdown(fd, SHUT_RD) < 0)
382 return -errno;
383
384 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
385
386 if (dprintf(fd,
387 "%s\n"
388 "%s\n"
389 "%i\n"
390 "%i\n"
391 "%i\n"
392 "%i\n"
393 "%i\n",
394 context->syslog_identifier ?: ident,
395 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
396 context->syslog_priority,
397 !!context->syslog_level_prefix,
398 false,
399 is_kmsg_output(output),
400 is_terminal_output(output)) < 0)
401 return -errno;
402
403 return move_fd(TAKE_FD(fd), nfd, false);
404 }
405
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) via open_terminal() and hands
 * the result to move_fd() with 'nfd' as the target. Returns the error of open_terminal() if it
 * fails, otherwise whatever move_fd() returns. */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
418
419 static int acquire_path(const char *path, int flags, mode_t mode) {
420 _cleanup_close_ int fd = -EBADF;
421 int r;
422
423 assert(path);
424
425 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
426 flags |= O_CREAT;
427
428 fd = open(path, flags|O_NOCTTY, mode);
429 if (fd >= 0)
430 return TAKE_FD(fd);
431
432 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
433 return -errno;
434
435 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
436
437 fd = socket(AF_UNIX, SOCK_STREAM, 0);
438 if (fd < 0)
439 return -errno;
440
441 r = connect_unix_path(fd, AT_FDCWD, path);
442 if (IN_SET(r, -ENOTSOCK, -EINVAL))
443 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
444 * wasn't an AF_UNIX socket after all */
445 return -ENXIO;
446 if (r < 0)
447 return r;
448
449 if ((flags & O_ACCMODE) == O_RDONLY)
450 r = shutdown(fd, SHUT_WR);
451 else if ((flags & O_ACCMODE) == O_WRONLY)
452 r = shutdown(fd, SHUT_RD);
453 else
454 r = 0;
455 if (r < 0)
456 return -errno;
457
458 return TAKE_FD(fd);
459 }
460
461 static int fixup_input(
462 const ExecContext *context,
463 int socket_fd,
464 bool apply_tty_stdin) {
465
466 ExecInput std_input;
467
468 assert(context);
469
470 std_input = context->std_input;
471
472 if (is_terminal_input(std_input) && !apply_tty_stdin)
473 return EXEC_INPUT_NULL;
474
475 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
476 return EXEC_INPUT_NULL;
477
478 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
479 return EXEC_INPUT_NULL;
480
481 return std_input;
482 }
483
484 static int fixup_output(ExecOutput output, int socket_fd) {
485
486 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
487 return EXEC_OUTPUT_INHERIT;
488
489 return output;
490 }
491
492 static int setup_input(
493 const ExecContext *context,
494 const ExecParameters *params,
495 int socket_fd,
496 const int named_iofds[static 3]) {
497
498 ExecInput i;
499 int r;
500
501 assert(context);
502 assert(params);
503 assert(named_iofds);
504
505 if (params->stdin_fd >= 0) {
506 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
507 return -errno;
508
509 /* Try to make this the controlling tty, if it is a tty, and reset it */
510 if (isatty(STDIN_FILENO)) {
511 unsigned rows = context->tty_rows, cols = context->tty_cols;
512
513 (void) exec_context_tty_size(context, &rows, &cols);
514 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
515 (void) reset_terminal_fd(STDIN_FILENO, true);
516 (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
517 }
518
519 return STDIN_FILENO;
520 }
521
522 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
523
524 switch (i) {
525
526 case EXEC_INPUT_NULL:
527 return open_null_as(O_RDONLY, STDIN_FILENO);
528
529 case EXEC_INPUT_TTY:
530 case EXEC_INPUT_TTY_FORCE:
531 case EXEC_INPUT_TTY_FAIL: {
532 unsigned rows, cols;
533 int fd;
534
535 fd = acquire_terminal(exec_context_tty_path(context),
536 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
537 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
538 ACQUIRE_TERMINAL_WAIT,
539 USEC_INFINITY);
540 if (fd < 0)
541 return fd;
542
543 r = exec_context_tty_size(context, &rows, &cols);
544 if (r < 0)
545 return r;
546
547 r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
548 if (r < 0)
549 return r;
550
551 return move_fd(fd, STDIN_FILENO, false);
552 }
553
554 case EXEC_INPUT_SOCKET:
555 assert(socket_fd >= 0);
556
557 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
558
559 case EXEC_INPUT_NAMED_FD:
560 assert(named_iofds[STDIN_FILENO] >= 0);
561
562 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
563 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
564
565 case EXEC_INPUT_DATA: {
566 int fd;
567
568 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
569 if (fd < 0)
570 return fd;
571
572 return move_fd(fd, STDIN_FILENO, false);
573 }
574
575 case EXEC_INPUT_FILE: {
576 bool rw;
577 int fd;
578
579 assert(context->stdio_file[STDIN_FILENO]);
580
581 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
582 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
583
584 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
585 if (fd < 0)
586 return fd;
587
588 return move_fd(fd, STDIN_FILENO, false);
589 }
590
591 default:
592 assert_not_reached();
593 }
594 }
595
596 static bool can_inherit_stderr_from_stdout(
597 const ExecContext *context,
598 ExecOutput o,
599 ExecOutput e) {
600
601 assert(context);
602
603 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
604 * stderr fd */
605
606 if (e == EXEC_OUTPUT_INHERIT)
607 return true;
608 if (e != o)
609 return false;
610
611 if (e == EXEC_OUTPUT_NAMED_FD)
612 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
613
614 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
615 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
616
617 return true;
618 }
619
620 static int setup_output(
621 const Unit *unit,
622 const ExecContext *context,
623 const ExecParameters *params,
624 int fileno,
625 int socket_fd,
626 const int named_iofds[static 3],
627 const char *ident,
628 uid_t uid,
629 gid_t gid,
630 dev_t *journal_stream_dev,
631 ino_t *journal_stream_ino) {
632
633 ExecOutput o;
634 ExecInput i;
635 int r;
636
637 assert(unit);
638 assert(context);
639 assert(params);
640 assert(ident);
641 assert(journal_stream_dev);
642 assert(journal_stream_ino);
643
644 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
645
646 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
647 return -errno;
648
649 return STDOUT_FILENO;
650 }
651
652 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
653 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
654 return -errno;
655
656 return STDERR_FILENO;
657 }
658
659 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
660 o = fixup_output(context->std_output, socket_fd);
661
662 if (fileno == STDERR_FILENO) {
663 ExecOutput e;
664 e = fixup_output(context->std_error, socket_fd);
665
666 /* This expects the input and output are already set up */
667
668 /* Don't change the stderr file descriptor if we inherit all
669 * the way and are not on a tty */
670 if (e == EXEC_OUTPUT_INHERIT &&
671 o == EXEC_OUTPUT_INHERIT &&
672 i == EXEC_INPUT_NULL &&
673 !is_terminal_input(context->std_input) &&
674 getppid() != 1)
675 return fileno;
676
677 /* Duplicate from stdout if possible */
678 if (can_inherit_stderr_from_stdout(context, o, e))
679 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
680
681 o = e;
682
683 } else if (o == EXEC_OUTPUT_INHERIT) {
684 /* If input got downgraded, inherit the original value */
685 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
686 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
687
688 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
689 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
690 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
691
692 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
693 if (getppid() != 1)
694 return fileno;
695
696 /* We need to open /dev/null here anew, to get the right access mode. */
697 return open_null_as(O_WRONLY, fileno);
698 }
699
700 switch (o) {
701
702 case EXEC_OUTPUT_NULL:
703 return open_null_as(O_WRONLY, fileno);
704
705 case EXEC_OUTPUT_TTY:
706 if (is_terminal_input(i))
707 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
708
709 /* We don't reset the terminal if this is just about output */
710 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
711
712 case EXEC_OUTPUT_KMSG:
713 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
714 case EXEC_OUTPUT_JOURNAL:
715 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
716 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
717 if (r < 0) {
718 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
719 fileno == STDOUT_FILENO ? "stdout" : "stderr");
720 r = open_null_as(O_WRONLY, fileno);
721 } else {
722 struct stat st;
723
724 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
725 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
726 * services to detect whether they are connected to the journal or not.
727 *
728 * If both stdout and stderr are connected to a stream then let's make sure to store the data
729 * about STDERR as that's usually the best way to do logging. */
730
731 if (fstat(fileno, &st) >= 0 &&
732 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
733 *journal_stream_dev = st.st_dev;
734 *journal_stream_ino = st.st_ino;
735 }
736 }
737 return r;
738
739 case EXEC_OUTPUT_SOCKET:
740 assert(socket_fd >= 0);
741
742 return RET_NERRNO(dup2(socket_fd, fileno));
743
744 case EXEC_OUTPUT_NAMED_FD:
745 assert(named_iofds[fileno] >= 0);
746
747 (void) fd_nonblock(named_iofds[fileno], false);
748 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
749
750 case EXEC_OUTPUT_FILE:
751 case EXEC_OUTPUT_FILE_APPEND:
752 case EXEC_OUTPUT_FILE_TRUNCATE: {
753 bool rw;
754 int fd, flags;
755
756 assert(context->stdio_file[fileno]);
757
758 rw = context->std_input == EXEC_INPUT_FILE &&
759 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
760
761 if (rw)
762 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
763
764 flags = O_WRONLY;
765 if (o == EXEC_OUTPUT_FILE_APPEND)
766 flags |= O_APPEND;
767 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
768 flags |= O_TRUNC;
769
770 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
771 if (fd < 0)
772 return fd;
773
774 return move_fd(fd, fileno, 0);
775 }
776
777 default:
778 assert_not_reached();
779 }
780 }
781
782 static int chown_terminal(int fd, uid_t uid) {
783 int r;
784
785 assert(fd >= 0);
786
787 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
788 if (isatty(fd) < 1) {
789 if (IN_SET(errno, EINVAL, ENOTTY))
790 return 0; /* not a tty */
791
792 return -errno;
793 }
794
795 /* This might fail. What matters are the results. */
796 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
797 if (r < 0)
798 return r;
799
800 return 1;
801 }
802
803 static int setup_confirm_stdio(
804 const ExecContext *context,
805 const char *vc,
806 int *ret_saved_stdin,
807 int *ret_saved_stdout) {
808
809 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
810 unsigned rows, cols;
811 int r;
812
813 assert(ret_saved_stdin);
814 assert(ret_saved_stdout);
815
816 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
817 if (saved_stdin < 0)
818 return -errno;
819
820 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
821 if (saved_stdout < 0)
822 return -errno;
823
824 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
825 if (fd < 0)
826 return fd;
827
828 r = chown_terminal(fd, getuid());
829 if (r < 0)
830 return r;
831
832 r = reset_terminal_fd(fd, true);
833 if (r < 0)
834 return r;
835
836 r = exec_context_tty_size(context, &rows, &cols);
837 if (r < 0)
838 return r;
839
840 r = terminal_set_size_fd(fd, vc, rows, cols);
841 if (r < 0)
842 return r;
843
844 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
845 TAKE_FD(fd);
846 if (r < 0)
847 return r;
848
849 *ret_saved_stdin = TAKE_FD(saved_stdin);
850 *ret_saved_stdout = TAKE_FD(saved_stdout);
851 return 0;
852 }
853
854 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
855 assert(err < 0);
856
857 if (err == -ETIMEDOUT)
858 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
859 else {
860 errno = -err;
861 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
862 }
863 }
864
865 static void write_confirm_error(int err, const char *vc, const Unit *u) {
866 _cleanup_close_ int fd = -EBADF;
867
868 assert(vc);
869
870 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
871 if (fd < 0)
872 return;
873
874 write_confirm_error_fd(err, fd, u);
875 }
876
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved descriptors back onto stdin
 * and stdout (skipping any that is negative) and closes the saved copies. Returns 0 on success, or
 * -errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        /* The copies are closed in any case; the variables take on whatever safe_close() returns */
        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
898
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* 'f': don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS = 0, /* 's': don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE = 1, /* 'y': execute the command; also used when asking fails */
};
904
905 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
906 int saved_stdout = -1, saved_stdin = -1, r;
907 _cleanup_free_ char *e = NULL;
908 char c;
909
910 /* For any internal errors, assume a positive response. */
911 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
912 if (r < 0) {
913 write_confirm_error(r, vc, u);
914 return CONFIRM_EXECUTE;
915 }
916
917 /* confirm_spawn might have been disabled while we were sleeping. */
918 if (manager_is_confirm_spawn_disabled(u->manager)) {
919 r = 1;
920 goto restore_stdio;
921 }
922
923 e = ellipsize(cmdline, 60, 100);
924 if (!e) {
925 log_oom();
926 r = CONFIRM_EXECUTE;
927 goto restore_stdio;
928 }
929
930 for (;;) {
931 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
932 if (r < 0) {
933 write_confirm_error_fd(r, STDOUT_FILENO, u);
934 r = CONFIRM_EXECUTE;
935 goto restore_stdio;
936 }
937
938 switch (c) {
939 case 'c':
940 printf("Resuming normal execution.\n");
941 manager_disable_confirm_spawn();
942 r = 1;
943 break;
944 case 'D':
945 unit_dump(u, stdout, " ");
946 continue; /* ask again */
947 case 'f':
948 printf("Failing execution.\n");
949 r = CONFIRM_PRETEND_FAILURE;
950 break;
951 case 'h':
952 printf(" c - continue, proceed without asking anymore\n"
953 " D - dump, show the state of the unit\n"
954 " f - fail, don't execute the command and pretend it failed\n"
955 " h - help\n"
956 " i - info, show a short summary of the unit\n"
957 " j - jobs, show jobs that are in progress\n"
958 " s - skip, don't execute the command and pretend it succeeded\n"
959 " y - yes, execute the command\n");
960 continue; /* ask again */
961 case 'i':
962 printf(" Description: %s\n"
963 " Unit: %s\n"
964 " Command: %s\n",
965 u->id, u->description, cmdline);
966 continue; /* ask again */
967 case 'j':
968 manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, " ");
969 continue; /* ask again */
970 case 'n':
971 /* 'n' was removed in favor of 'f'. */
972 printf("Didn't understand 'n', did you mean 'f'?\n");
973 continue; /* ask again */
974 case 's':
975 printf("Skipping execution.\n");
976 r = CONFIRM_PRETEND_SUCCESS;
977 break;
978 case 'y':
979 r = CONFIRM_EXECUTE;
980 break;
981 default:
982 assert_not_reached();
983 }
984 break;
985 }
986
987 restore_stdio:
988 restore_confirm_stdio(&saved_stdin, &saved_stdout);
989 return r;
990 }
991
992 static int get_fixed_user(
993 const char *username,
994 const char **ret_user,
995 uid_t *ret_uid,
996 gid_t *ret_gid,
997 const char **ret_home,
998 const char **ret_shell) {
999
1000 int r;
1001
1002 assert(username);
1003 assert(ret_user);
1004
1005 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1006 * (i.e. are "/" or "/bin/nologin"). */
1007
1008 r = get_user_creds(&username, ret_uid, ret_gid, ret_home, ret_shell, USER_CREDS_CLEAN);
1009 if (r < 0)
1010 return r;
1011
1012 *ret_user = username;
1013 return 0;
1014 }
1015
/* Resolves 'groupname' via get_group_creds() and returns the group name as reported by that call
 * in *ret_group and the GID in *ret_gid. Returns 0 on success, or the negative error of
 * get_group_creds(). */
static int get_fixed_group(
                const char *groupname,
                const char **ret_group,
                gid_t *ret_gid) {

        const char *name = groupname;
        int r;

        assert(groupname);
        assert(ret_group);

        r = get_group_creds(&name, ret_gid, /* flags = */ 0);
        if (r < 0)
                return r;

        /* get_group_creds() may have replaced the name we passed in */
        *ret_group = name;
        return 0;
}
1033
1034 static int get_supplementary_groups(const ExecContext *c, const char *user,
1035 const char *group, gid_t gid,
1036 gid_t **supplementary_gids, int *ngids) {
1037 int r, k = 0;
1038 int ngroups_max;
1039 bool keep_groups = false;
1040 gid_t *groups = NULL;
1041 _cleanup_free_ gid_t *l_gids = NULL;
1042
1043 assert(c);
1044
1045 /*
1046 * If user is given, then lookup GID and supplementary groups list.
1047 * We avoid NSS lookups for gid=0. Also we have to initialize groups
1048 * here and as early as possible so we keep the list of supplementary
1049 * groups of the caller.
1050 */
1051 if (user && gid_is_valid(gid) && gid != 0) {
1052 /* First step, initialize groups from /etc/groups */
1053 if (initgroups(user, gid) < 0)
1054 return -errno;
1055
1056 keep_groups = true;
1057 }
1058
1059 if (strv_isempty(c->supplementary_groups))
1060 return 0;
1061
1062 /*
1063 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1064 * be positive, otherwise fail.
1065 */
1066 errno = 0;
1067 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1068 if (ngroups_max <= 0)
1069 return errno_or_else(EOPNOTSUPP);
1070
1071 l_gids = new(gid_t, ngroups_max);
1072 if (!l_gids)
1073 return -ENOMEM;
1074
1075 if (keep_groups) {
1076 /*
1077 * Lookup the list of groups that the user belongs to, we
1078 * avoid NSS lookups here too for gid=0.
1079 */
1080 k = ngroups_max;
1081 if (getgrouplist(user, gid, l_gids, &k) < 0)
1082 return -EINVAL;
1083 } else
1084 k = 0;
1085
1086 STRV_FOREACH(i, c->supplementary_groups) {
1087 const char *g;
1088
1089 if (k >= ngroups_max)
1090 return -E2BIG;
1091
1092 g = *i;
1093 r = get_group_creds(&g, l_gids+k, 0);
1094 if (r < 0)
1095 return r;
1096
1097 k++;
1098 }
1099
1100 /*
1101 * Sets ngids to zero to drop all supplementary groups, happens
1102 * when we are under root and SupplementaryGroups= is empty.
1103 */
1104 if (k == 0) {
1105 *ngids = 0;
1106 return 0;
1107 }
1108
1109 /* Otherwise get the final list of supplementary groups */
1110 groups = memdup(l_gids, sizeof(gid_t) * k);
1111 if (!groups)
1112 return -ENOMEM;
1113
1114 *supplementary_gids = groups;
1115 *ngids = k;
1116
1117 groups = NULL;
1118
1119 return 0;
1120 }
1121
/* Applies group credentials to the calling process: installs the supplementary group list if it is
 * non-empty, then sets real, effective and saved GID to 'gid' if that is valid. Returns 0 on
 * success, a negative errno-style value on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        int r;

        /* SupplementaryGroups= is only handled if it is not empty */
        if (ngids > 0) {
                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1140
/* Updates the securebits of the calling process: the bits in 'mask' are cleared, then 'bits' are
 * set. Returns 0 if the resulting value equals the current one (nothing is written then), 1 if new
 * securebits were applied, and -errno if reading or writing them fails. */
static int set_securebits(unsigned bits, unsigned mask) {
        int cur = prctl(PR_GET_SECUREBITS);
        if (cur < 0)
                return -errno;

        unsigned before = (unsigned) cur;
        unsigned after = (before & ~mask) | bits;

        /* Avoid the second prctl() if nothing would change */
        if (after == before)
                return 0;

        return prctl(PR_SET_SECUREBITS, after) < 0 ? -errno : 1;
}
1159
1160 static int enforce_user(
1161 const ExecContext *context,
1162 uid_t uid,
1163 uint64_t capability_ambient_set) {
1164 assert(context);
1165 int r;
1166
1167 if (!uid_is_valid(uid))
1168 return 0;
1169
1170 /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1171 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1172 * case. */
1173
1174 if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1175
1176 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1177 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1178 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1179 if (r < 0)
1180 return r;
1181 }
1182
1183 /* Second step: actually set the uids */
1184 if (setresuid(uid, uid, uid) < 0)
1185 return -errno;
1186
1187 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1188 * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1189 * outside of this call. */
1190 return 0;
1191 }
1192
1193 #if HAVE_PAM
1194
/* PAM conversation callback installed as .conv of the pam_conv structure in setup_pam(). All
 * arguments are ignored and PAM_CONV_ERR is returned unconditionally. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        /* We don't support conversations */

        return PAM_CONV_ERR;
}
1205
1206 #endif
1207
/* Opens a PAM session for service 'name' on behalf of 'user' and forks off an "(sd-pam)" helper process
 * that tears the session down again once it receives SIGTERM or notices that its parent is gone.
 *
 * name, user: handed to pam_start().
 * uid, gid:   IDs the helper process switches to before it starts waiting.
 * tty:        optional TTY path for PAM_TTY; if NULL the TTY behind stdin is used, if there is one.
 * env:        variables fed to PAM via pam_putenv(); replaced by the pam_getenvlist() result on success.
 * fds, n_fds: file descriptors that are closed inside the helper process.
 *
 * When built without PAM this is a no-op returning 0. On success the result of strv_free_and_replace()
 * is returned. Failures reported by PAM are logged and turned into -EPERM, all other failures are logged
 * and returned as the negative errno-style value they were reported with. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep the PAM modules quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failing to establish credentials is not fatal, it is only logged at debug level */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* sigwait() reports failure through a positive error number in its return
                         * value; it never returns a negative value and does not set errno. Hence look
                         * at the return value itself, and only trust 'sig' if the call succeeded. */
                        do
                                r = sigwait(&ss, &sig);
                        while (r == EINTR);
                        if (r != 0)
                                goto child_finish;

                        assert(sig == SIGTERM);
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        /* We get here either because a PAM call failed (pam_code != PAM_SUCCESS) or because
         * barrier_create() or safe_fork() failed (r < 0). */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1424
1425 static void rename_process_from_path(const char *path) {
1426 _cleanup_free_ char *buf = NULL;
1427 const char *p;
1428
1429 assert(path);
1430
1431 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1432 * /bin/ps */
1433
1434 if (path_extract_filename(path, &buf) < 0) {
1435 rename_process("(...)");
1436 return;
1437 }
1438
1439 size_t l = strlen(buf);
1440 if (l > 8) {
1441 /* The end of the process name is usually more interesting, since the first bit might just be
1442 * "systemd-" */
1443 p = buf + l - 8;
1444 l = 8;
1445 } else
1446 p = buf;
1447
1448 char process_name[11];
1449 process_name[0] = '(';
1450 memcpy(process_name+1, p, l);
1451 process_name[1+l] = ')';
1452 process_name[1+l+1] = 0;
1453
1454 rename_process(process_name);
1455 }
1456
1457 static bool context_has_address_families(const ExecContext *c) {
1458 assert(c);
1459
1460 return c->address_families_allow_list ||
1461 !set_isempty(c->address_families);
1462 }
1463
1464 static bool context_has_syscall_filters(const ExecContext *c) {
1465 assert(c);
1466
1467 return c->syscall_allow_list ||
1468 !hashmap_isempty(c->syscall_filter);
1469 }
1470
1471 static bool context_has_syscall_logs(const ExecContext *c) {
1472 assert(c);
1473
1474 return c->syscall_log_allow_list ||
1475 !hashmap_isempty(c->syscall_log);
1476 }
1477
1478 static bool context_has_no_new_privileges(const ExecContext *c) {
1479 assert(c);
1480
1481 if (c->no_new_privileges)
1482 return true;
1483
1484 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1485 return false;
1486
1487 /* We need NNP if we have any form of seccomp and are unprivileged */
1488 return c->lock_personality ||
1489 c->memory_deny_write_execute ||
1490 c->private_devices ||
1491 c->protect_clock ||
1492 c->protect_hostname ||
1493 c->protect_kernel_tunables ||
1494 c->protect_kernel_modules ||
1495 c->protect_kernel_logs ||
1496 context_has_address_families(c) ||
1497 exec_context_restrict_namespaces_set(c) ||
1498 c->restrict_realtime ||
1499 c->restrict_suid_sgid ||
1500 !set_isempty(c->syscall_archs) ||
1501 context_has_syscall_filters(c) ||
1502 context_has_syscall_logs(c);
1503 }
1504
1505 #if HAVE_SECCOMP
1506
1507 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1508
1509 if (is_seccomp_available())
1510 return false;
1511
1512 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1513 return true;
1514 }
1515
1516 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1517 uint32_t negative_action, default_action, action;
1518 int r;
1519
1520 assert(u);
1521 assert(c);
1522
1523 if (!context_has_syscall_filters(c))
1524 return 0;
1525
1526 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1527 return 0;
1528
1529 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1530
1531 if (c->syscall_allow_list) {
1532 default_action = negative_action;
1533 action = SCMP_ACT_ALLOW;
1534 } else {
1535 default_action = SCMP_ACT_ALLOW;
1536 action = negative_action;
1537 }
1538
1539 if (needs_ambient_hack) {
1540 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1541 if (r < 0)
1542 return r;
1543 }
1544
1545 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1546 }
1547
1548 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1549 #ifdef SCMP_ACT_LOG
1550 uint32_t default_action, action;
1551 #endif
1552
1553 assert(u);
1554 assert(c);
1555
1556 if (!context_has_syscall_logs(c))
1557 return 0;
1558
1559 #ifdef SCMP_ACT_LOG
1560 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1561 return 0;
1562
1563 if (c->syscall_log_allow_list) {
1564 /* Log nothing but the ones listed */
1565 default_action = SCMP_ACT_ALLOW;
1566 action = SCMP_ACT_LOG;
1567 } else {
1568 /* Log everything but the ones listed */
1569 default_action = SCMP_ACT_LOG;
1570 action = SCMP_ACT_ALLOW;
1571 }
1572
1573 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1574 #else
1575 /* old libseccomp */
1576 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1577 return 0;
1578 #endif
1579 }
1580
1581 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1582 assert(u);
1583 assert(c);
1584
1585 if (set_isempty(c->syscall_archs))
1586 return 0;
1587
1588 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1589 return 0;
1590
1591 return seccomp_restrict_archs(c->syscall_archs);
1592 }
1593
1594 static int apply_address_families(const Unit* u, const ExecContext *c) {
1595 assert(u);
1596 assert(c);
1597
1598 if (!context_has_address_families(c))
1599 return 0;
1600
1601 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1602 return 0;
1603
1604 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1605 }
1606
1607 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1608 int r;
1609
1610 assert(u);
1611 assert(c);
1612
1613 if (!c->memory_deny_write_execute)
1614 return 0;
1615
1616 /* use prctl() if kernel supports it (6.3) */
1617 r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1618 if (r == 0) {
1619 log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1620 return 0;
1621 }
1622 if (r < 0 && errno != EINVAL)
1623 return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1624 /* else use seccomp */
1625 log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1626
1627 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1628 return 0;
1629
1630 return seccomp_memory_deny_write_execute();
1631 }
1632
1633 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1634 assert(u);
1635 assert(c);
1636
1637 if (!c->restrict_realtime)
1638 return 0;
1639
1640 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1641 return 0;
1642
1643 return seccomp_restrict_realtime();
1644 }
1645
1646 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1647 assert(u);
1648 assert(c);
1649
1650 if (!c->restrict_suid_sgid)
1651 return 0;
1652
1653 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1654 return 0;
1655
1656 return seccomp_restrict_suid_sgid();
1657 }
1658
1659 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1660 assert(u);
1661 assert(c);
1662
1663 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1664 * let's protect even those systems where this is left on in the kernel. */
1665
1666 if (!c->protect_kernel_tunables)
1667 return 0;
1668
1669 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1670 return 0;
1671
1672 return seccomp_protect_sysctl();
1673 }
1674
1675 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1676 assert(u);
1677 assert(c);
1678
1679 /* Turn off module syscalls on ProtectKernelModules=yes */
1680
1681 if (!c->protect_kernel_modules)
1682 return 0;
1683
1684 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1685 return 0;
1686
1687 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1688 }
1689
1690 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1691 assert(u);
1692 assert(c);
1693
1694 if (!c->protect_kernel_logs)
1695 return 0;
1696
1697 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1698 return 0;
1699
1700 return seccomp_protect_syslog();
1701 }
1702
1703 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1704 assert(u);
1705 assert(c);
1706
1707 if (!c->protect_clock)
1708 return 0;
1709
1710 if (skip_seccomp_unavailable(u, "ProtectClock="))
1711 return 0;
1712
1713 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1714 }
1715
1716 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1717 assert(u);
1718 assert(c);
1719
1720 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1721
1722 if (!c->private_devices)
1723 return 0;
1724
1725 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1726 return 0;
1727
1728 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1729 }
1730
1731 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1732 assert(u);
1733 assert(c);
1734
1735 if (!exec_context_restrict_namespaces_set(c))
1736 return 0;
1737
1738 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1739 return 0;
1740
1741 return seccomp_restrict_namespaces(c->restrict_namespaces);
1742 }
1743
1744 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1745 unsigned long personality;
1746 int r;
1747
1748 assert(u);
1749 assert(c);
1750
1751 if (!c->lock_personality)
1752 return 0;
1753
1754 if (skip_seccomp_unavailable(u, "LockPersonality="))
1755 return 0;
1756
1757 personality = c->personality;
1758
1759 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1760 if (personality == PERSONALITY_INVALID) {
1761
1762 r = opinionated_personality(&personality);
1763 if (r < 0)
1764 return r;
1765 }
1766
1767 return seccomp_lock_personality(personality);
1768 }
1769
1770 #endif
1771
1772 #if HAVE_LIBBPF
1773 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1774 assert(u);
1775 assert(c);
1776
1777 if (!exec_context_restrict_filesystems_set(c))
1778 return 0;
1779
1780 if (!u->manager->restrict_fs) {
1781 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1782 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1783 return 0;
1784 }
1785
1786 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1787 }
1788 #endif
1789
1790 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1791 assert(u);
1792 assert(c);
1793
1794 if (!c->protect_hostname)
1795 return 0;
1796
1797 if (ns_type_supported(NAMESPACE_UTS)) {
1798 if (unshare(CLONE_NEWUTS) < 0) {
1799 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1800 *ret_exit_status = EXIT_NAMESPACE;
1801 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1802 }
1803
1804 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1805 }
1806 } else
1807 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1808
1809 #if HAVE_SECCOMP
1810 int r;
1811
1812 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1813 return 0;
1814
1815 r = seccomp_protect_hostname();
1816 if (r < 0) {
1817 *ret_exit_status = EXIT_SECCOMP;
1818 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1819 }
1820 #endif
1821
1822 return 0;
1823 }
1824
1825 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1826 assert(idle_pipe);
1827
1828 idle_pipe[1] = safe_close(idle_pipe[1]);
1829 idle_pipe[2] = safe_close(idle_pipe[2]);
1830
1831 if (idle_pipe[0] >= 0) {
1832 int r;
1833
1834 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1835
1836 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1837 ssize_t n;
1838
1839 /* Signal systemd that we are bored and want to continue. */
1840 n = write(idle_pipe[3], "x", 1);
1841 if (n > 0)
1842 /* Wait for systemd to react to the signal above. */
1843 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1844 }
1845
1846 idle_pipe[0] = safe_close(idle_pipe[0]);
1847
1848 }
1849
1850 idle_pipe[3] = safe_close(idle_pipe[3]);
1851 }
1852
1853 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1854
1855 static int build_environment(
1856 const Unit *u,
1857 const ExecContext *c,
1858 const ExecParameters *p,
1859 const CGroupContext *cgroup_context,
1860 size_t n_fds,
1861 char **fdnames,
1862 const char *home,
1863 const char *username,
1864 const char *shell,
1865 dev_t journal_stream_dev,
1866 ino_t journal_stream_ino,
1867 const char *memory_pressure_path,
1868 char ***ret) {
1869
1870 _cleanup_strv_free_ char **our_env = NULL;
1871 size_t n_env = 0;
1872 char *x;
1873 int r;
1874
1875 assert(u);
1876 assert(c);
1877 assert(p);
1878 assert(ret);
1879
1880 #define N_ENV_VARS 19
1881 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1882 if (!our_env)
1883 return -ENOMEM;
1884
1885 if (n_fds > 0) {
1886 _cleanup_free_ char *joined = NULL;
1887
1888 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1889 return -ENOMEM;
1890 our_env[n_env++] = x;
1891
1892 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1893 return -ENOMEM;
1894 our_env[n_env++] = x;
1895
1896 joined = strv_join(fdnames, ":");
1897 if (!joined)
1898 return -ENOMEM;
1899
1900 x = strjoin("LISTEN_FDNAMES=", joined);
1901 if (!x)
1902 return -ENOMEM;
1903 our_env[n_env++] = x;
1904 }
1905
1906 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1907 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1908 return -ENOMEM;
1909 our_env[n_env++] = x;
1910
1911 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1912 return -ENOMEM;
1913 our_env[n_env++] = x;
1914 }
1915
1916 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1917 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1918 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1919 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1920 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1921 if (!x)
1922 return -ENOMEM;
1923 our_env[n_env++] = x;
1924 }
1925
1926 /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
1927 * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
1928 * really make much sense since we're not logged in. Hence we conditionalize the three based on
1929 * SetLoginEnvironment= switch. */
1930 if (!c->user && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
1931 r = get_fixed_user("root", &username, NULL, NULL, &home, &shell);
1932 if (r < 0)
1933 return log_unit_error_errno(u, r, "Failed to determine user credentials for root: %m");
1934 }
1935
1936 bool set_user_login_env = c->set_login_environment >= 0 ? c->set_login_environment : (c->user || c->dynamic_user);
1937
1938 if (username) {
1939 x = strjoin("USER=", username);
1940 if (!x)
1941 return -ENOMEM;
1942 our_env[n_env++] = x;
1943
1944 if (set_user_login_env) {
1945 x = strjoin("LOGNAME=", username);
1946 if (!x)
1947 return -ENOMEM;
1948 our_env[n_env++] = x;
1949 }
1950 }
1951
1952 if (home && set_user_login_env) {
1953 x = strjoin("HOME=", home);
1954 if (!x)
1955 return -ENOMEM;
1956
1957 path_simplify(x + 5);
1958 our_env[n_env++] = x;
1959 }
1960
1961 if (shell && set_user_login_env) {
1962 x = strjoin("SHELL=", shell);
1963 if (!x)
1964 return -ENOMEM;
1965
1966 path_simplify(x + 6);
1967 our_env[n_env++] = x;
1968 }
1969
1970 if (!sd_id128_is_null(u->invocation_id)) {
1971 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1972 return -ENOMEM;
1973
1974 our_env[n_env++] = x;
1975 }
1976
1977 if (exec_context_needs_term(c)) {
1978 _cleanup_free_ char *cmdline = NULL;
1979 const char *tty_path, *term = NULL;
1980
1981 tty_path = exec_context_tty_path(c);
1982
1983 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1984 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1985 * container manager passes to PID 1 ends up all the way in the console login shown. */
1986
1987 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1988 term = getenv("TERM");
1989 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1990 _cleanup_free_ char *key = NULL;
1991
1992 key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
1993 if (!key)
1994 return -ENOMEM;
1995
1996 r = proc_cmdline_get_key(key, 0, &cmdline);
1997 if (r < 0)
1998 log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
1999 else if (r > 0)
2000 term = cmdline;
2001 }
2002
2003 if (!term)
2004 term = default_term_for_tty(tty_path);
2005
2006 x = strjoin("TERM=", term);
2007 if (!x)
2008 return -ENOMEM;
2009 our_env[n_env++] = x;
2010 }
2011
2012 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
2013 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
2014 return -ENOMEM;
2015
2016 our_env[n_env++] = x;
2017 }
2018
2019 if (c->log_namespace) {
2020 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
2021 if (!x)
2022 return -ENOMEM;
2023
2024 our_env[n_env++] = x;
2025 }
2026
2027 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2028 _cleanup_free_ char *joined = NULL;
2029 const char *n;
2030
2031 if (!p->prefix[t])
2032 continue;
2033
2034 if (c->directories[t].n_items == 0)
2035 continue;
2036
2037 n = exec_directory_env_name_to_string(t);
2038 if (!n)
2039 continue;
2040
2041 for (size_t i = 0; i < c->directories[t].n_items; i++) {
2042 _cleanup_free_ char *prefixed = NULL;
2043
2044 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2045 if (!prefixed)
2046 return -ENOMEM;
2047
2048 if (!strextend_with_separator(&joined, ":", prefixed))
2049 return -ENOMEM;
2050 }
2051
2052 x = strjoin(n, "=", joined);
2053 if (!x)
2054 return -ENOMEM;
2055
2056 our_env[n_env++] = x;
2057 }
2058
2059 _cleanup_free_ char *creds_dir = NULL;
2060 r = exec_context_get_credential_directory(c, p, u->id, &creds_dir);
2061 if (r < 0)
2062 return r;
2063 if (r > 0) {
2064 x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
2065 if (!x)
2066 return -ENOMEM;
2067
2068 our_env[n_env++] = x;
2069 }
2070
2071 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2072 return -ENOMEM;
2073
2074 our_env[n_env++] = x;
2075
2076 if (memory_pressure_path) {
2077 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2078 if (!x)
2079 return -ENOMEM;
2080
2081 our_env[n_env++] = x;
2082
2083 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2084 _cleanup_free_ char *b = NULL, *e = NULL;
2085
2086 if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2087 MEMORY_PRESSURE_DEFAULT_TYPE,
2088 cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2089 CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2090 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2091 return -ENOMEM;
2092
2093 if (base64mem(b, strlen(b) + 1, &e) < 0)
2094 return -ENOMEM;
2095
2096 x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2097 if (!x)
2098 return -ENOMEM;
2099
2100 our_env[n_env++] = x;
2101 }
2102 }
2103
2104 assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2105 #undef N_ENV_VARS
2106
2107 *ret = TAKE_PTR(our_env);
2108
2109 return 0;
2110 }
2111
2112 static int build_pass_environment(const ExecContext *c, char ***ret) {
2113 _cleanup_strv_free_ char **pass_env = NULL;
2114 size_t n_env = 0;
2115
2116 STRV_FOREACH(i, c->pass_environment) {
2117 _cleanup_free_ char *x = NULL;
2118 char *v;
2119
2120 v = getenv(*i);
2121 if (!v)
2122 continue;
2123 x = strjoin(*i, "=", v);
2124 if (!x)
2125 return -ENOMEM;
2126
2127 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2128 return -ENOMEM;
2129
2130 pass_env[n_env++] = TAKE_PTR(x);
2131 pass_env[n_env] = NULL;
2132 }
2133
2134 *ret = TAKE_PTR(pass_env);
2135
2136 return 0;
2137 }
2138
2139 bool exec_needs_network_namespace(const ExecContext *context) {
2140 assert(context);
2141
2142 return context->private_network || context->network_namespace_path;
2143 }
2144
2145 static bool exec_needs_ephemeral(const ExecContext *context) {
2146 return (context->root_image || context->root_directory) && context->root_ephemeral;
2147 }
2148
2149 static bool exec_needs_ipc_namespace(const ExecContext *context) {
2150 assert(context);
2151
2152 return context->private_ipc || context->ipc_namespace_path;
2153 }
2154
2155 bool exec_needs_mount_namespace(
2156 const ExecContext *context,
2157 const ExecParameters *params,
2158 const ExecRuntime *runtime) {
2159
2160 assert(context);
2161
2162 if (context->root_image)
2163 return true;
2164
2165 if (!strv_isempty(context->read_write_paths) ||
2166 !strv_isempty(context->read_only_paths) ||
2167 !strv_isempty(context->inaccessible_paths) ||
2168 !strv_isempty(context->exec_paths) ||
2169 !strv_isempty(context->no_exec_paths))
2170 return true;
2171
2172 if (context->n_bind_mounts > 0)
2173 return true;
2174
2175 if (context->n_temporary_filesystems > 0)
2176 return true;
2177
2178 if (context->n_mount_images > 0)
2179 return true;
2180
2181 if (context->n_extension_images > 0)
2182 return true;
2183
2184 if (!strv_isempty(context->extension_directories))
2185 return true;
2186
2187 if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
2188 return true;
2189
2190 if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
2191 return true;
2192
2193 if (context->private_devices ||
2194 context->private_mounts > 0 ||
2195 (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
2196 context->protect_system != PROTECT_SYSTEM_NO ||
2197 context->protect_home != PROTECT_HOME_NO ||
2198 context->protect_kernel_tunables ||
2199 context->protect_kernel_modules ||
2200 context->protect_kernel_logs ||
2201 context->protect_control_groups ||
2202 context->protect_proc != PROTECT_PROC_DEFAULT ||
2203 context->proc_subset != PROC_SUBSET_ALL ||
2204 exec_needs_ipc_namespace(context))
2205 return true;
2206
2207 if (context->root_directory) {
2208 if (exec_context_get_effective_mount_apivfs(context))
2209 return true;
2210
2211 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2212 if (params && !params->prefix[t])
2213 continue;
2214
2215 if (context->directories[t].n_items > 0)
2216 return true;
2217 }
2218 }
2219
2220 if (context->dynamic_user &&
2221 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2222 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2223 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2224 return true;
2225
2226 if (context->log_namespace)
2227 return true;
2228
2229 return false;
2230 }
2231
2232 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2233 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2234 _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
2235 _cleanup_close_ int unshare_ready_fd = -EBADF;
2236 _cleanup_(sigkill_waitp) pid_t pid = 0;
2237 uint64_t c = 1;
2238 ssize_t n;
2239 int r;
2240
2241 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2242 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2243 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2244 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2245 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2246 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2247 * continues execution normally.
2248 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2249 * does not need CAP_SETUID to write the single line mapping to itself. */
2250
2251 /* Can only set up multiple mappings with CAP_SETUID. */
2252 if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2253 r = asprintf(&uid_map,
2254 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2255 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2256 ouid, ouid, uid, uid);
2257 else
2258 r = asprintf(&uid_map,
2259 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2260 ouid, ouid);
2261
2262 if (r < 0)
2263 return -ENOMEM;
2264
2265 /* Can only set up multiple mappings with CAP_SETGID. */
2266 if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2267 r = asprintf(&gid_map,
2268 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2269 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2270 ogid, ogid, gid, gid);
2271 else
2272 r = asprintf(&gid_map,
2273 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2274 ogid, ogid);
2275
2276 if (r < 0)
2277 return -ENOMEM;
2278
2279 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2280 * namespace. */
2281 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2282 if (unshare_ready_fd < 0)
2283 return -errno;
2284
2285 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2286 * failed. */
2287 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2288 return -errno;
2289
2290 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2291 if (r < 0)
2292 return r;
2293 if (r == 0) {
2294 _cleanup_close_ int fd = -EBADF;
2295 const char *a;
2296 pid_t ppid;
2297
2298 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2299 * here, after the parent opened its own user namespace. */
2300
2301 ppid = getppid();
2302 errno_pipe[0] = safe_close(errno_pipe[0]);
2303
2304 /* Wait until the parent unshared the user namespace */
2305 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2306 r = -errno;
2307 goto child_fail;
2308 }
2309
2310 /* Disable the setgroups() system call in the child user namespace, for good. */
2311 a = procfs_file_alloca(ppid, "setgroups");
2312 fd = open(a, O_WRONLY|O_CLOEXEC);
2313 if (fd < 0) {
2314 if (errno != ENOENT) {
2315 r = -errno;
2316 goto child_fail;
2317 }
2318
2319 /* If the file is missing the kernel is too old, let's continue anyway. */
2320 } else {
2321 if (write(fd, "deny\n", 5) < 0) {
2322 r = -errno;
2323 goto child_fail;
2324 }
2325
2326 fd = safe_close(fd);
2327 }
2328
2329 /* First write the GID map */
2330 a = procfs_file_alloca(ppid, "gid_map");
2331 fd = open(a, O_WRONLY|O_CLOEXEC);
2332 if (fd < 0) {
2333 r = -errno;
2334 goto child_fail;
2335 }
2336 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2337 r = -errno;
2338 goto child_fail;
2339 }
2340 fd = safe_close(fd);
2341
2342 /* Then write the UID map */
2343 a = procfs_file_alloca(ppid, "uid_map");
2344 fd = open(a, O_WRONLY|O_CLOEXEC);
2345 if (fd < 0) {
2346 r = -errno;
2347 goto child_fail;
2348 }
2349 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2350 r = -errno;
2351 goto child_fail;
2352 }
2353
2354 _exit(EXIT_SUCCESS);
2355
2356 child_fail:
2357 (void) write(errno_pipe[1], &r, sizeof(r));
2358 _exit(EXIT_FAILURE);
2359 }
2360
2361 errno_pipe[1] = safe_close(errno_pipe[1]);
2362
2363 if (unshare(CLONE_NEWUSER) < 0)
2364 return -errno;
2365
2366 /* Let the child know that the namespace is ready now */
2367 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2368 return -errno;
2369
2370 /* Try to read an error code from the child */
2371 n = read(errno_pipe[0], &r, sizeof(r));
2372 if (n < 0)
2373 return -errno;
2374 if (n == sizeof(r)) { /* an error code was sent to us */
2375 if (r < 0)
2376 return r;
2377 return -EIO;
2378 }
2379 if (n != 0) /* on success we should have read 0 bytes */
2380 return -EIO;
2381
2382 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2383 if (r < 0)
2384 return r;
2385 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2386 return -EIO;
2387
2388 return 0;
2389 }
2390
2391 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2392 assert(context);
2393
2394 if (!context->dynamic_user)
2395 return false;
2396
2397 if (type == EXEC_DIRECTORY_CONFIGURATION)
2398 return false;
2399
2400 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2401 return false;
2402
2403 return true;
2404 }
2405
2406 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2407 _cleanup_free_ char *src_abs = NULL;
2408 int r;
2409
2410 assert(source);
2411
2412 src_abs = path_join(root, source);
2413 if (!src_abs)
2414 return -ENOMEM;
2415
2416 STRV_FOREACH(dst, symlinks) {
2417 _cleanup_free_ char *dst_abs = NULL;
2418
2419 dst_abs = path_join(root, *dst);
2420 if (!dst_abs)
2421 return -ENOMEM;
2422
2423 r = mkdir_parents_label(dst_abs, 0755);
2424 if (r < 0)
2425 return r;
2426
2427 r = symlink_idempotent(src_abs, dst_abs, true);
2428 if (r < 0)
2429 return r;
2430 }
2431
2432 return 0;
2433 }
2434
2435 static int setup_exec_directory(
2436 Unit *u,
2437 const ExecContext *context,
2438 const ExecParameters *params,
2439 uid_t uid,
2440 gid_t gid,
2441 ExecDirectoryType type,
2442 bool needs_mount_namespace,
2443 int *exit_status) {
2444
2445 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2446 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2447 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2448 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2449 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2450 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2451 };
2452 int r;
2453
2454 assert(context);
2455 assert(params);
2456 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2457 assert(exit_status);
2458
2459 if (!params->prefix[type])
2460 return 0;
2461
2462 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2463 if (!uid_is_valid(uid))
2464 uid = 0;
2465 if (!gid_is_valid(gid))
2466 gid = 0;
2467 }
2468
2469 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2470 _cleanup_free_ char *p = NULL, *pp = NULL;
2471
2472 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2473 if (!p) {
2474 r = -ENOMEM;
2475 goto fail;
2476 }
2477
2478 r = mkdir_parents_label(p, 0755);
2479 if (r < 0)
2480 goto fail;
2481
2482 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2483
2484 /* If we are in user mode, and a configuration directory exists but a state directory
2485 * doesn't exist, then we likely are upgrading from an older systemd version that
2486 * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2487 * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2488 * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now
2489 * separated. If a service has both dirs configured but only the configuration dir
2490 * exists and the state dir does not, we assume we are looking at an update
2491 * situation. Hence, create a compatibility symlink, so that all expectations are
2492 * met.
2493 *
2494 * (We also do something similar with the log directory, which still doesn't exist in
2495 * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2496
2497 /* this assumes the state dir is always created before the configuration dir */
2498 assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2499 assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2500
2501 r = laccess(p, F_OK);
2502 if (r == -ENOENT) {
2503 _cleanup_free_ char *q = NULL;
2504
2505 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2506 * under the configuration hierarchy. */
2507
2508 if (type == EXEC_DIRECTORY_STATE)
2509 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2510 else if (type == EXEC_DIRECTORY_LOGS)
2511 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2512 else
2513 assert_not_reached();
2514 if (!q) {
2515 r = -ENOMEM;
2516 goto fail;
2517 }
2518
2519 r = laccess(q, F_OK);
2520 if (r >= 0) {
2521 /* It does exist! This hence looks like an update. Symlink the
2522 * configuration directory into the state directory. */
2523
2524 r = symlink_idempotent(q, p, /* make_relative= */ true);
2525 if (r < 0)
2526 goto fail;
2527
2528 log_unit_notice(u, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2529 continue;
2530 } else if (r != -ENOENT)
2531 log_unit_warning_errno(u, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2532
2533 } else if (r < 0)
2534 log_unit_warning_errno(u, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2535 }
2536
2537 if (exec_directory_is_private(context, type)) {
2538 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2539 * case we want to avoid leaving a directory around fully accessible that is owned by
2540 * a dynamic user whose UID is later on reused. To lock this down we use the same
2541 * trick used by container managers to prohibit host users to get access to files of
2542 * the same UID in containers: we place everything inside a directory that has an
2543 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2544 * for unprivileged host code. We then use fs namespacing to make this directory
2545 * permeable for the service itself.
2546 *
2547 * Specifically: for a service which wants a special directory "foo/" we first create
2548 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2549 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2550 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2551 * unprivileged host users can't look into it. Inside of the namespace of the unit
2552 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2553 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2554 * for the service and making sure it only gets access to the dirs it needs but no
2555 * others. Tricky? Yes, absolutely, but it works!
2556 *
2557 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2558 * to be owned by the service itself.
2559 *
2560 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2561 * for sharing files or sockets with other services. */
2562
2563 pp = path_join(params->prefix[type], "private");
2564 if (!pp) {
2565 r = -ENOMEM;
2566 goto fail;
2567 }
2568
2569 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2570 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2571 if (r < 0)
2572 goto fail;
2573
2574 if (!path_extend(&pp, context->directories[type].items[i].path)) {
2575 r = -ENOMEM;
2576 goto fail;
2577 }
2578
2579 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2580 r = mkdir_parents_label(pp, 0755);
2581 if (r < 0)
2582 goto fail;
2583
2584 if (is_dir(p, false) > 0 &&
2585 (laccess(pp, F_OK) == -ENOENT)) {
2586
2587 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2588 * it over. Most likely the service has been upgraded from one that didn't use
2589 * DynamicUser=1, to one that does. */
2590
2591 log_unit_info(u, "Found pre-existing public %s= directory %s, migrating to %s.\n"
2592 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2593 exec_directory_type_to_string(type), p, pp);
2594
2595 r = RET_NERRNO(rename(p, pp));
2596 if (r < 0)
2597 goto fail;
2598 } else {
2599 /* Otherwise, create the actual directory for the service */
2600
2601 r = mkdir_label(pp, context->directories[type].mode);
2602 if (r < 0 && r != -EEXIST)
2603 goto fail;
2604 }
2605
2606 if (!context->directories[type].items[i].only_create) {
2607 /* And link it up from the original place.
2608 * Notes
2609 * 1) If a mount namespace is going to be used, then this symlink remains on
2610 * the host, and a new one for the child namespace will be created later.
2611 * 2) It is not necessary to create this symlink when one of its parent
2612 * directories is specified and already created. E.g.
2613 * StateDirectory=foo foo/bar
2614 * In that case, the inode points to pp and p for "foo/bar" are the same:
2615 * pp = "/var/lib/private/foo/bar"
2616 * p = "/var/lib/foo/bar"
2617 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2618 * we do not need to create the symlink, but we cannot create the symlink.
2619 * See issue #24783. */
2620 r = symlink_idempotent(pp, p, true);
2621 if (r < 0)
2622 goto fail;
2623 }
2624
2625 } else {
2626 _cleanup_free_ char *target = NULL;
2627
2628 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2629 readlink_and_make_absolute(p, &target) >= 0) {
2630 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2631
2632 /* This already exists and is a symlink? Interesting. Maybe it's one created
2633 * by DynamicUser=1 (see above)?
2634 *
2635 * We do this for all directory types except for ConfigurationDirectory=,
2636 * since they all support the private/ symlink logic at least in some
2637 * configurations, see above. */
2638
2639 r = chase(target, NULL, 0, &target_resolved, NULL);
2640 if (r < 0)
2641 goto fail;
2642
2643 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2644 if (!q) {
2645 r = -ENOMEM;
2646 goto fail;
2647 }
2648
2649 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2650 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2651 if (r < 0)
2652 goto fail;
2653
2654 if (path_equal(q_resolved, target_resolved)) {
2655
2656 /* Hmm, apparently DynamicUser= was once turned on for this service,
2657 * but is no longer. Let's move the directory back up. */
2658
2659 log_unit_info(u, "Found pre-existing private %s= directory %s, migrating to %s.\n"
2660 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2661 exec_directory_type_to_string(type), q, p);
2662
2663 r = RET_NERRNO(unlink(p));
2664 if (r < 0)
2665 goto fail;
2666
2667 r = RET_NERRNO(rename(q, p));
2668 if (r < 0)
2669 goto fail;
2670 }
2671 }
2672
2673 r = mkdir_label(p, context->directories[type].mode);
2674 if (r < 0) {
2675 if (r != -EEXIST)
2676 goto fail;
2677
2678 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2679 struct stat st;
2680
2681 /* Don't change the owner/access mode of the configuration directory,
2682 * as in the common case it is not written to by a service, and shall
2683 * not be writable. */
2684
2685 r = RET_NERRNO(stat(p, &st));
2686 if (r < 0)
2687 goto fail;
2688
2689 /* Still complain if the access mode doesn't match */
2690 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2691 log_unit_warning(u, "%s \'%s\' already exists but the mode is different. "
2692 "(File system: %o %sMode: %o)",
2693 exec_directory_type_to_string(type), context->directories[type].items[i].path,
2694 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2695
2696 continue;
2697 }
2698 }
2699 }
2700
2701 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2702 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2703 * current UID/GID ownership.) */
2704 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2705 if (r < 0)
2706 goto fail;
2707
2708 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2709 * available to user code anyway */
2710 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2711 continue;
2712
2713 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2714 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2715 * assignments to exist. */
2716 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2717 if (r < 0)
2718 goto fail;
2719 }
2720
2721 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2722 * they are set up later, to allow configuring empty var/run/etc. */
2723 if (!needs_mount_namespace)
2724 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2725 r = create_many_symlinks(params->prefix[type],
2726 context->directories[type].items[i].path,
2727 context->directories[type].items[i].symlinks);
2728 if (r < 0)
2729 goto fail;
2730 }
2731
2732 return 0;
2733
2734 fail:
2735 *exit_status = exit_status_table[type];
2736 return r;
2737 }
2738
#if ENABLE_SMACK
/* Applies the SMACK label for the calling process (PID 0 is passed to mac_smack_apply_pid()).
 *
 * If the context configures a process label, that one is used. Otherwise, if the manager has a default
 * process label, the SMACK_ATTR_EXEC label read from the executable is applied if there is one, and the
 * manager default if not. Returns 0 on success or if nothing is configured, negative errno-style value
 * on failure. */
static int setup_smack(
                const Manager *manager,
                const ExecContext *context,
                int executable_fd) {

        _cleanup_free_ char *exec_label = NULL;
        const char *fallback;
        int r;

        assert(context);
        assert(executable_fd >= 0);

        /* An explicitly configured label always wins */
        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

        fallback = manager->defaults.smack_process_label;
        if (!fallback)
                return 0;

        /* An absent xattr is not an error, we then simply stick to the default label */
        r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
        if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                return r;

        r = mac_smack_apply_pid(0, exec_label ?: fallback);
        if (r < 0)
                return r;

        return 0;
}
#endif
2768
2769 static int compile_bind_mounts(
2770 const ExecContext *context,
2771 const ExecParameters *params,
2772 BindMount **ret_bind_mounts,
2773 size_t *ret_n_bind_mounts,
2774 char ***ret_empty_directories) {
2775
2776 _cleanup_strv_free_ char **empty_directories = NULL;
2777 BindMount *bind_mounts = NULL;
2778 size_t n, h = 0;
2779 int r;
2780
2781 assert(context);
2782 assert(params);
2783 assert(ret_bind_mounts);
2784 assert(ret_n_bind_mounts);
2785 assert(ret_empty_directories);
2786
2787 CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2788
2789 n = context->n_bind_mounts;
2790 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2791 if (!params->prefix[t])
2792 continue;
2793
2794 for (size_t i = 0; i < context->directories[t].n_items; i++)
2795 n += !context->directories[t].items[i].only_create;
2796 }
2797
2798 if (n <= 0) {
2799 *ret_bind_mounts = NULL;
2800 *ret_n_bind_mounts = 0;
2801 *ret_empty_directories = NULL;
2802 return 0;
2803 }
2804
2805 bind_mounts = new(BindMount, n);
2806 if (!bind_mounts)
2807 return -ENOMEM;
2808
2809 for (size_t i = 0; i < context->n_bind_mounts; i++) {
2810 BindMount *item = context->bind_mounts + i;
2811 _cleanup_free_ char *s = NULL, *d = NULL;
2812
2813 s = strdup(item->source);
2814 if (!s)
2815 return -ENOMEM;
2816
2817 d = strdup(item->destination);
2818 if (!d)
2819 return -ENOMEM;
2820
2821 bind_mounts[h++] = (BindMount) {
2822 .source = TAKE_PTR(s),
2823 .destination = TAKE_PTR(d),
2824 .read_only = item->read_only,
2825 .recursive = item->recursive,
2826 .ignore_enoent = item->ignore_enoent,
2827 };
2828 }
2829
2830 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2831 if (!params->prefix[t])
2832 continue;
2833
2834 if (context->directories[t].n_items == 0)
2835 continue;
2836
2837 if (exec_directory_is_private(context, t) &&
2838 !exec_context_with_rootfs(context)) {
2839 char *private_root;
2840
2841 /* So this is for a dynamic user, and we need to make sure the process can access its own
2842 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2843 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2844
2845 private_root = path_join(params->prefix[t], "private");
2846 if (!private_root)
2847 return -ENOMEM;
2848
2849 r = strv_consume(&empty_directories, private_root);
2850 if (r < 0)
2851 return r;
2852 }
2853
2854 for (size_t i = 0; i < context->directories[t].n_items; i++) {
2855 _cleanup_free_ char *s = NULL, *d = NULL;
2856
2857 /* When one of the parent directories is in the list, we cannot create the symlink
2858 * for the child directory. See also the comments in setup_exec_directory(). */
2859 if (context->directories[t].items[i].only_create)
2860 continue;
2861
2862 if (exec_directory_is_private(context, t))
2863 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
2864 else
2865 s = path_join(params->prefix[t], context->directories[t].items[i].path);
2866 if (!s)
2867 return -ENOMEM;
2868
2869 if (exec_directory_is_private(context, t) &&
2870 exec_context_with_rootfs(context))
2871 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2872 * directory is not created on the root directory. So, let's bind-mount the directory
2873 * on the 'non-private' place. */
2874 d = path_join(params->prefix[t], context->directories[t].items[i].path);
2875 else
2876 d = strdup(s);
2877 if (!d)
2878 return -ENOMEM;
2879
2880 bind_mounts[h++] = (BindMount) {
2881 .source = TAKE_PTR(s),
2882 .destination = TAKE_PTR(d),
2883 .read_only = false,
2884 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2885 .recursive = true,
2886 .ignore_enoent = false,
2887 };
2888 }
2889 }
2890
2891 assert(h == n);
2892
2893 *ret_bind_mounts = TAKE_PTR(bind_mounts);
2894 *ret_n_bind_mounts = n;
2895 *ret_empty_directories = TAKE_PTR(empty_directories);
2896
2897 return (int) n;
2898 }
2899
2900 /* ret_symlinks will contain a list of pairs src:dest that describes
2901 * the symlinks to create later on. For example, the symlinks needed
2902 * to safely give private directories to DynamicUser=1 users. */
2903 static int compile_symlinks(
2904 const ExecContext *context,
2905 const ExecParameters *params,
2906 bool setup_os_release_symlink,
2907 char ***ret_symlinks) {
2908
2909 _cleanup_strv_free_ char **symlinks = NULL;
2910 int r;
2911
2912 assert(context);
2913 assert(params);
2914 assert(ret_symlinks);
2915
2916 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2917 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
2918 _cleanup_free_ char *private_path = NULL, *path = NULL;
2919
2920 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
2921 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
2922
2923 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2924 dst_abs = path_join(params->prefix[dt], *symlink);
2925 if (!src_abs || !dst_abs)
2926 return -ENOMEM;
2927
2928 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2929 if (r < 0)
2930 return r;
2931 }
2932
2933 if (!exec_directory_is_private(context, dt) ||
2934 exec_context_with_rootfs(context) ||
2935 context->directories[dt].items[i].only_create)
2936 continue;
2937
2938 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
2939 if (!private_path)
2940 return -ENOMEM;
2941
2942 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2943 if (!path)
2944 return -ENOMEM;
2945
2946 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2947 if (r < 0)
2948 return r;
2949 }
2950 }
2951
2952 /* We make the host's os-release available via a symlink, so that we can copy it atomically
2953 * and readers will never get a half-written version. Note that, while the paths specified here are
2954 * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2955 * 'os-release -> .os-release-stage/os-release' is what will be created. */
2956 if (setup_os_release_symlink) {
2957 r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release");
2958 if (r < 0)
2959 return r;
2960
2961 r = strv_extend(&symlinks, "/run/host/os-release");
2962 if (r < 0)
2963 return r;
2964 }
2965
2966 *ret_symlinks = TAKE_PTR(symlinks);
2967
2968 return 0;
2969 }
2970
2971 static bool insist_on_sandboxing(
2972 const ExecContext *context,
2973 const char *root_dir,
2974 const char *root_image,
2975 const BindMount *bind_mounts,
2976 size_t n_bind_mounts) {
2977
2978 assert(context);
2979 assert(n_bind_mounts == 0 || bind_mounts);
2980
2981 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2982 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2983 * rearrange stuff in a way we cannot ignore gracefully. */
2984
2985 if (context->n_temporary_filesystems > 0)
2986 return true;
2987
2988 if (root_dir || root_image)
2989 return true;
2990
2991 if (context->n_mount_images > 0)
2992 return true;
2993
2994 if (context->dynamic_user)
2995 return true;
2996
2997 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
2998 return true;
2999
3000 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3001 * essential. */
3002 for (size_t i = 0; i < n_bind_mounts; i++)
3003 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3004 return true;
3005
3006 if (context->log_namespace)
3007 return true;
3008
3009 return false;
3010 }
3011
3012 static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
3013 _cleanup_close_ int fd = -EBADF;
3014 int r;
3015
3016 if (!runtime || !runtime->ephemeral_copy)
3017 return 0;
3018
3019 r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
3020 if (r < 0)
3021 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
3022
3023 CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
3024
3025 fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3026 if (fd >= 0)
3027 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3028 return 0;
3029
3030 if (fd != -EAGAIN)
3031 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3032
3033 log_debug("Making ephemeral snapshot of %s to %s",
3034 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3035
3036 if (context->root_image)
3037 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
3038 COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
3039 else
3040 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
3041 AT_FDCWD, runtime->ephemeral_copy,
3042 BTRFS_SNAPSHOT_FALLBACK_COPY |
3043 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3044 BTRFS_SNAPSHOT_RECURSIVE |
3045 BTRFS_SNAPSHOT_LOCK_BSD);
3046 if (fd < 0)
3047 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
3048 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3049
3050 if (context->root_image) {
3051 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3052 * which tends to not perform well in combination with lots of random writes.
3053 *
3054 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3055 * copy, but we at least want to make the intention clear.
3056 */
3057 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
3058 if (r < 0)
3059 log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
3060 }
3061
3062 r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
3063 if (r < 0)
3064 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
3065
3066 return 1;
3067 }
3068
3069 static int verity_settings_prepare(
3070 VeritySettings *verity,
3071 const char *root_image,
3072 const void *root_hash,
3073 size_t root_hash_size,
3074 const char *root_hash_path,
3075 const void *root_hash_sig,
3076 size_t root_hash_sig_size,
3077 const char *root_hash_sig_path,
3078 const char *verity_data_path) {
3079
3080 int r;
3081
3082 assert(verity);
3083
3084 if (root_hash) {
3085 void *d;
3086
3087 d = memdup(root_hash, root_hash_size);
3088 if (!d)
3089 return -ENOMEM;
3090
3091 free_and_replace(verity->root_hash, d);
3092 verity->root_hash_size = root_hash_size;
3093 verity->designator = PARTITION_ROOT;
3094 }
3095
3096 if (root_hash_sig) {
3097 void *d;
3098
3099 d = memdup(root_hash_sig, root_hash_sig_size);
3100 if (!d)
3101 return -ENOMEM;
3102
3103 free_and_replace(verity->root_hash_sig, d);
3104 verity->root_hash_sig_size = root_hash_sig_size;
3105 verity->designator = PARTITION_ROOT;
3106 }
3107
3108 if (verity_data_path) {
3109 r = free_and_strdup(&verity->data_path, verity_data_path);
3110 if (r < 0)
3111 return r;
3112 }
3113
3114 r = verity_settings_load(
3115 verity,
3116 root_image,
3117 root_hash_path,
3118 root_hash_sig_path);
3119 if (r < 0)
3120 return log_debug_errno(r, "Failed to load root hash: %m");
3121
3122 return 0;
3123 }
3124
3125 static int apply_mount_namespace(
3126 const Unit *u,
3127 ExecCommandFlags command_flags,
3128 const ExecContext *context,
3129 const ExecParameters *params,
3130 ExecRuntime *runtime,
3131 const char *memory_pressure_path,
3132 char **error_path) {
3133
3134 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3135 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3136 **read_write_paths_cleanup = NULL;
3137 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3138 *extension_dir = NULL, *host_os_release_stage = NULL;
3139 const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
3140 char **read_write_paths;
3141 bool needs_sandboxing, setup_os_release_symlink;
3142 BindMount *bind_mounts = NULL;
3143 size_t n_bind_mounts = 0;
3144 int r;
3145
3146 assert(context);
3147
3148 CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3149
3150 if (params->flags & EXEC_APPLY_CHROOT) {
3151 r = setup_ephemeral(context, runtime);
3152 if (r < 0)
3153 return r;
3154
3155 if (context->root_image)
3156 root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
3157 else
3158 root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
3159 }
3160
3161 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3162 if (r < 0)
3163 return r;
3164
3165 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3166 * service will need to write to it in order to start the notifications. */
3167 if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3168 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3169 if (!read_write_paths_cleanup)
3170 return -ENOMEM;
3171
3172 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3173 if (r < 0)
3174 return r;
3175
3176 read_write_paths = read_write_paths_cleanup;
3177 } else
3178 read_write_paths = context->read_write_paths;
3179
3180 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3181 if (needs_sandboxing) {
3182 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3183 * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3184 * use here. This does not apply when we are using /run/systemd/empty as fallback. */
3185
3186 if (context->private_tmp && runtime && runtime->shared) {
3187 if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3188 tmp_dir = runtime->shared->tmp_dir;
3189 else if (runtime->shared->tmp_dir)
3190 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
3191
3192 if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3193 var_tmp_dir = runtime->shared->var_tmp_dir;
3194 else if (runtime->shared->var_tmp_dir)
3195 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
3196 }
3197 }
3198
3199 /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3200 setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
3201 r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3202 if (r < 0)
3203 return r;
3204
3205 if (context->mount_propagation_flag == MS_SHARED)
3206 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3207
3208 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3209 r = exec_context_get_credential_directory(context, params, u->id, &creds_path);
3210 if (r < 0)
3211 return r;
3212 }
3213
3214 if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
3215 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3216 if (!propagate_dir)
3217 return -ENOMEM;
3218
3219 incoming_dir = strdup("/run/systemd/incoming");
3220 if (!incoming_dir)
3221 return -ENOMEM;
3222
3223 extension_dir = strdup("/run/systemd/unit-extensions");
3224 if (!extension_dir)
3225 return -ENOMEM;
3226
3227 /* If running under a different root filesystem, propagate the host's os-release. We make a
3228 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3229 if (setup_os_release_symlink) {
3230 host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3231 if (!host_os_release_stage)
3232 return -ENOMEM;
3233 }
3234 } else {
3235 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3236
3237 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3238 return -ENOMEM;
3239
3240 if (setup_os_release_symlink) {
3241 if (asprintf(&host_os_release_stage,
3242 "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3243 geteuid()) < 0)
3244 return -ENOMEM;
3245 }
3246 }
3247
3248 if (root_image) {
3249 r = verity_settings_prepare(
3250 &verity,
3251 root_image,
3252 context->root_hash, context->root_hash_size, context->root_hash_path,
3253 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3254 context->root_verity);
3255 if (r < 0)
3256 return r;
3257 }
3258
3259 NamespaceParameters parameters = {
3260 .runtime_scope = params->runtime_scope,
3261
3262 .root_directory = root_dir,
3263 .root_image = root_image,
3264 .root_image_options = context->root_image_options,
3265 .root_image_policy = context->root_image_policy ?: &image_policy_service,
3266
3267 .read_write_paths = read_write_paths,
3268 .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
3269 .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
3270
3271 .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
3272 .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
3273
3274 .empty_directories = empty_directories,
3275 .symlinks = symlinks,
3276
3277 .bind_mounts = bind_mounts,
3278 .n_bind_mounts = n_bind_mounts,
3279
3280 .temporary_filesystems = context->temporary_filesystems,
3281 .n_temporary_filesystems = context->n_temporary_filesystems,
3282
3283 .mount_images = context->mount_images,
3284 .n_mount_images = context->n_mount_images,
3285 .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
3286
3287 .tmp_dir = tmp_dir,
3288 .var_tmp_dir = var_tmp_dir,
3289
3290 .creds_path = creds_path,
3291 .log_namespace = context->log_namespace,
3292 .mount_propagation_flag = context->mount_propagation_flag,
3293
3294 .verity = &verity,
3295
3296 .extension_images = context->extension_images,
3297 .n_extension_images = context->n_extension_images,
3298 .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
3299 .extension_directories = context->extension_directories,
3300
3301 .propagate_dir = propagate_dir,
3302 .incoming_dir = incoming_dir,
3303 .extension_dir = extension_dir,
3304 .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
3305 .host_os_release_stage = host_os_release_stage,
3306
3307 /* If DynamicUser=no and RootDirectory= is set then let's pass a relaxed sandbox info,
3308 * otherwise enforce it, don't ignore protected paths and fail if we are unable to apply the
3309 * sandbox inside the mount namespace. */
3310 .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
3311
3312 .protect_control_groups = needs_sandboxing && context->protect_control_groups,
3313 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
3314 .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
3315 .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
3316 .protect_hostname = needs_sandboxing && context->protect_hostname,
3317
3318 .private_dev = needs_sandboxing && context->private_devices,
3319 .private_network = needs_sandboxing && exec_needs_network_namespace(context),
3320 .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
3321
3322 .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
3323
3324 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3325 .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
3326
3327 .protect_home = needs_sandboxing && context->protect_home,
3328 .protect_system = needs_sandboxing && context->protect_system,
3329 .protect_proc = needs_sandboxing && context->protect_proc,
3330 .proc_subset = needs_sandboxing && context->proc_subset,
3331 };
3332
3333 r = setup_namespace(&parameters, error_path);
3334 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3335 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3336 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3337 * completely different execution environment. */
3338 if (r == -ENOANO) {
3339 if (insist_on_sandboxing(
3340 context,
3341 root_dir, root_image,
3342 bind_mounts,
3343 n_bind_mounts))
3344 return log_unit_debug_errno(u,
3345 SYNTHETIC_ERRNO(EOPNOTSUPP),
3346 "Failed to set up namespace, and refusing to continue since "
3347 "the selected namespacing options alter mount environment non-trivially.\n"
3348 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3349 n_bind_mounts,
3350 context->n_temporary_filesystems,
3351 yes_no(root_dir),
3352 yes_no(root_image),
3353 yes_no(context->dynamic_user));
3354
3355 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3356 return 0;
3357 }
3358
3359 return r;
3360 }
3361
3362 static int apply_working_directory(
3363 const ExecContext *context,
3364 const ExecParameters *params,
3365 ExecRuntime *runtime,
3366 const char *home,
3367 int *exit_status) {
3368
3369 const char *d, *wd;
3370
3371 assert(context);
3372 assert(exit_status);
3373
3374 if (context->working_directory_home) {
3375
3376 if (!home) {
3377 *exit_status = EXIT_CHDIR;
3378 return -ENXIO;
3379 }
3380
3381 wd = home;
3382
3383 } else
3384 wd = empty_to_root(context->working_directory);
3385
3386 if (params->flags & EXEC_APPLY_CHROOT)
3387 d = wd;
3388 else
3389 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
3390
3391 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3392 *exit_status = EXIT_CHDIR;
3393 return -errno;
3394 }
3395
3396 return 0;
3397 }
3398
3399 static int apply_root_directory(
3400 const ExecContext *context,
3401 const ExecParameters *params,
3402 ExecRuntime *runtime,
3403 const bool needs_mount_ns,
3404 int *exit_status) {
3405
3406 assert(context);
3407 assert(exit_status);
3408
3409 if (params->flags & EXEC_APPLY_CHROOT)
3410 if (!needs_mount_ns && context->root_directory)
3411 if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
3412 *exit_status = EXIT_CHROOT;
3413 return -errno;
3414 }
3415
3416 return 0;
3417 }
3418
3419 static int setup_keyring(
3420 const Unit *u,
3421 const ExecContext *context,
3422 const ExecParameters *p,
3423 uid_t uid, gid_t gid) {
3424
3425 key_serial_t keyring;
3426 int r = 0;
3427 uid_t saved_uid;
3428 gid_t saved_gid;
3429
3430 assert(u);
3431 assert(context);
3432 assert(p);
3433
3434 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3435 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3436 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3437 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3438 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3439 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3440
3441 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3442 return 0;
3443
3444 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3445 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3446 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3447 * & group is just as nasty as acquiring a reference to the user keyring. */
3448
3449 saved_uid = getuid();
3450 saved_gid = getgid();
3451
3452 if (gid_is_valid(gid) && gid != saved_gid) {
3453 if (setregid(gid, -1) < 0)
3454 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3455 }
3456
3457 if (uid_is_valid(uid) && uid != saved_uid) {
3458 if (setreuid(uid, -1) < 0) {
3459 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3460 goto out;
3461 }
3462 }
3463
3464 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3465 if (keyring == -1) {
3466 if (errno == ENOSYS)
3467 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3468 else if (ERRNO_IS_PRIVILEGE(errno))
3469 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3470 else if (errno == EDQUOT)
3471 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3472 else
3473 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3474
3475 goto out;
3476 }
3477
3478 /* When requested link the user keyring into the session keyring. */
3479 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3480
3481 if (keyctl(KEYCTL_LINK,
3482 KEY_SPEC_USER_KEYRING,
3483 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3484 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3485 goto out;
3486 }
3487 }
3488
3489 /* Restore uid/gid back */
3490 if (uid_is_valid(uid) && uid != saved_uid) {
3491 if (setreuid(saved_uid, -1) < 0) {
3492 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3493 goto out;
3494 }
3495 }
3496
3497 if (gid_is_valid(gid) && gid != saved_gid) {
3498 if (setregid(saved_gid, -1) < 0)
3499 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3500 }
3501
3502 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3503 if (!sd_id128_is_null(u->invocation_id)) {
3504 key_serial_t key;
3505
3506 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3507 if (key == -1)
3508 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3509 else {
3510 if (keyctl(KEYCTL_SETPERM, key,
3511 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3512 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3513 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3514 }
3515 }
3516
3517 out:
3518 /* Revert back uid & gid for the last time, and exit */
3519 /* no extra logging, as only the first already reported error matters */
3520 if (getuid() != saved_uid)
3521 (void) setreuid(saved_uid, -1);
3522
3523 if (getgid() != saved_gid)
3524 (void) setregid(saved_gid, -1);
3525
3526 return r;
3527 }
3528
3529 static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
3530 assert(array);
3531 assert(n);
3532 assert(pair);
3533
3534 if (pair[0] >= 0)
3535 array[(*n)++] = pair[0];
3536 if (pair[1] >= 0)
3537 array[(*n)++] = pair[1];
3538 }
3539
3540 static int close_remaining_fds(
3541 const ExecParameters *params,
3542 const ExecRuntime *runtime,
3543 int user_lookup_fd,
3544 int socket_fd,
3545 const int *fds, size_t n_fds) {
3546
3547 size_t n_dont_close = 0;
3548 int dont_close[n_fds + 14];
3549
3550 assert(params);
3551
3552 if (params->stdin_fd >= 0)
3553 dont_close[n_dont_close++] = params->stdin_fd;
3554 if (params->stdout_fd >= 0)
3555 dont_close[n_dont_close++] = params->stdout_fd;
3556 if (params->stderr_fd >= 0)
3557 dont_close[n_dont_close++] = params->stderr_fd;
3558
3559 if (socket_fd >= 0)
3560 dont_close[n_dont_close++] = socket_fd;
3561 if (n_fds > 0) {
3562 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3563 n_dont_close += n_fds;
3564 }
3565
3566 if (runtime)
3567 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3568
3569 if (runtime && runtime->shared) {
3570 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3571 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
3572 }
3573
3574 if (runtime && runtime->dynamic_creds) {
3575 if (runtime->dynamic_creds->user)
3576 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3577 if (runtime->dynamic_creds->group)
3578 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
3579 }
3580
3581 if (user_lookup_fd >= 0)
3582 dont_close[n_dont_close++] = user_lookup_fd;
3583
3584 return close_all_fds(dont_close, n_dont_close);
3585 }
3586
3587 static int send_user_lookup(
3588 Unit *unit,
3589 int user_lookup_fd,
3590 uid_t uid,
3591 gid_t gid) {
3592
3593 assert(unit);
3594
3595 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3596 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3597 * specified. */
3598
3599 if (user_lookup_fd < 0)
3600 return 0;
3601
3602 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3603 return 0;
3604
3605 if (writev(user_lookup_fd,
3606 (struct iovec[]) {
3607 IOVEC_MAKE(&uid, sizeof(uid)),
3608 IOVEC_MAKE(&gid, sizeof(gid)),
3609 IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
3610 return -errno;
3611
3612 return 0;
3613 }
3614
3615 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3616 int r;
3617
3618 assert(c);
3619 assert(home);
3620 assert(buf);
3621
3622 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3623
3624 if (*home)
3625 return 0;
3626
3627 if (!c->working_directory_home)
3628 return 0;
3629
3630 r = get_home_dir(buf);
3631 if (r < 0)
3632 return r;
3633
3634 *home = *buf;
3635 return 1;
3636 }
3637
3638 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3639 _cleanup_strv_free_ char ** list = NULL;
3640 int r;
3641
3642 assert(c);
3643 assert(p);
3644 assert(ret);
3645
3646 assert(c->dynamic_user);
3647
3648 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3649 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3650 * directories. */
3651
3652 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3653 if (t == EXEC_DIRECTORY_CONFIGURATION)
3654 continue;
3655
3656 if (!p->prefix[t])
3657 continue;
3658
3659 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3660 char *e;
3661
3662 if (exec_directory_is_private(c, t))
3663 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3664 else
3665 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3666 if (!e)
3667 return -ENOMEM;
3668
3669 r = strv_consume(&list, e);
3670 if (r < 0)
3671 return r;
3672 }
3673 }
3674
3675 *ret = TAKE_PTR(list);
3676
3677 return 0;
3678 }
3679
3680 static int exec_parameters_get_cgroup_path(
3681 const ExecParameters *params,
3682 const CGroupContext *c,
3683 char **ret) {
3684
3685 const char *subgroup = NULL;
3686 char *p;
3687
3688 assert(params);
3689 assert(ret);
3690
3691 if (!params->cgroup_path)
3692 return -EINVAL;
3693
3694 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3695 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3696 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3697 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3698 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3699 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3700 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3701 * flag, which is only passed for the former statements, not for the latter. */
3702
3703 if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
3704 if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
3705 subgroup = ".control";
3706 else
3707 subgroup = c->delegate_subgroup;
3708 }
3709
3710 if (subgroup)
3711 p = path_join(params->cgroup_path, subgroup);
3712 else
3713 p = strdup(params->cgroup_path);
3714 if (!p)
3715 return -ENOMEM;
3716
3717 *ret = p;
3718 return !!subgroup;
3719 }
3720
3721 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3722 _cleanup_(cpu_set_reset) CPUSet s = {};
3723 int r;
3724
3725 assert(c);
3726 assert(ret);
3727
3728 if (!c->numa_policy.nodes.set) {
3729 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3730 return 0;
3731 }
3732
3733 r = numa_to_cpu_set(&c->numa_policy, &s);
3734 if (r < 0)
3735 return r;
3736
3737 cpu_set_reset(ret);
3738
3739 return cpu_set_add_all(ret, &s);
3740 }
3741
3742 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3743 assert(c);
3744
3745 return c->cpu_affinity_from_numa;
3746 }
3747
3748 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3749 int r;
3750
3751 assert(fds);
3752 assert(n_fds);
3753 assert(*n_fds < fds_size);
3754 assert(ret_fd);
3755
3756 if (fd < 0) {
3757 *ret_fd = -EBADF;
3758 return 0;
3759 }
3760
3761 if (fd < 3 + (int) *n_fds) {
3762 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3763 * the fds we pass to the process (or which are closed only during execve). */
3764
3765 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3766 if (r < 0)
3767 return -errno;
3768
3769 close_and_replace(fd, r);
3770 }
3771
3772 *ret_fd = fds[*n_fds] = fd;
3773 (*n_fds) ++;
3774 return 1;
3775 }
3776
3777 static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
3778 union sockaddr_union addr = {
3779 .un.sun_family = AF_UNIX,
3780 };
3781 socklen_t sa_len;
3782 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3783 int r;
3784
3785 assert(u);
3786 assert(of);
3787 assert(ofd >= 0);
3788
3789 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3790 if (r < 0)
3791 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
3792
3793 sa_len = r;
3794
3795 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
3796 _cleanup_close_ int fd = -EBADF;
3797
3798 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
3799 if (fd < 0)
3800 return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
3801
3802 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3803 if (r == -EPROTOTYPE)
3804 continue;
3805 if (r < 0)
3806 return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
3807
3808 return TAKE_FD(fd);
3809 }
3810
3811 return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
3812 }
3813
3814 static int get_open_file_fd(Unit *u, const OpenFile *of) {
3815 struct stat st;
3816 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3817
3818 assert(u);
3819 assert(of);
3820
3821 ofd = open(of->path, O_PATH | O_CLOEXEC);
3822 if (ofd < 0)
3823 return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
3824
3825 if (fstat(ofd, &st) < 0)
3826 return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
3827
3828 if (S_ISSOCK(st.st_mode)) {
3829 fd = connect_unix_harder(u, of, ofd);
3830 if (fd < 0)
3831 return fd;
3832
3833 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
3834 return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
3835 of->path);
3836
3837 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
3838 } else {
3839 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3840 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3841 flags |= O_APPEND;
3842 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3843 flags |= O_TRUNC;
3844
3845 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3846 if (fd < 0)
3847 return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
3848
3849 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
3850 }
3851
3852 return TAKE_FD(fd);
3853 }
3854
3855 static int collect_open_file_fds(
3856 Unit *u,
3857 OpenFile* open_files,
3858 int **fds,
3859 char ***fdnames,
3860 size_t *n_fds) {
3861 int r;
3862
3863 assert(u);
3864 assert(fds);
3865 assert(fdnames);
3866 assert(n_fds);
3867
3868 LIST_FOREACH(open_files, of, open_files) {
3869 _cleanup_close_ int fd = -EBADF;
3870
3871 fd = get_open_file_fd(u, of);
3872 if (fd < 0) {
3873 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3874 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
3875 continue;
3876 }
3877
3878 return fd;
3879 }
3880
3881 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
3882 return -ENOMEM;
3883
3884 r = strv_extend(fdnames, of->fdname);
3885 if (r < 0)
3886 return r;
3887
3888 (*fds)[*n_fds] = TAKE_FD(fd);
3889
3890 (*n_fds)++;
3891 }
3892
3893 return 0;
3894 }
3895
3896 static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
3897 assert(unit);
3898 assert(msg);
3899 assert(executable);
3900
3901 if (!DEBUG_LOGGING)
3902 return;
3903
3904 _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3905
3906 log_unit_struct(unit, LOG_DEBUG,
3907 "EXECUTABLE=%s", executable,
3908 LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)),
3909 LOG_UNIT_INVOCATION_ID(unit));
3910 }
3911
3912 static bool exec_context_need_unprivileged_private_users(
3913 const ExecContext *context,
3914 const ExecParameters *params) {
3915
3916 assert(context);
3917 assert(params);
3918
3919 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3920 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3921 * (system manager) then we have privileges and don't need this. */
3922 if (params->runtime_scope != RUNTIME_SCOPE_USER)
3923 return false;
3924
3925 return context->private_users ||
3926 context->private_tmp ||
3927 context->private_devices ||
3928 context->private_network ||
3929 context->network_namespace_path ||
3930 context->private_ipc ||
3931 context->ipc_namespace_path ||
3932 context->private_mounts > 0 ||
3933 context->mount_apivfs ||
3934 context->n_bind_mounts > 0 ||
3935 context->n_temporary_filesystems > 0 ||
3936 context->root_directory ||
3937 !strv_isempty(context->extension_directories) ||
3938 context->protect_system != PROTECT_SYSTEM_NO ||
3939 context->protect_home != PROTECT_HOME_NO ||
3940 context->protect_kernel_tunables ||
3941 context->protect_kernel_modules ||
3942 context->protect_kernel_logs ||
3943 context->protect_control_groups ||
3944 context->protect_clock ||
3945 context->protect_hostname ||
3946 !strv_isempty(context->read_write_paths) ||
3947 !strv_isempty(context->read_only_paths) ||
3948 !strv_isempty(context->inaccessible_paths) ||
3949 !strv_isempty(context->exec_paths) ||
3950 !strv_isempty(context->no_exec_paths);
3951 }
3952
3953 static int exec_child(
3954 Unit *unit,
3955 const ExecCommand *command,
3956 const ExecContext *context,
3957 const ExecParameters *params,
3958 ExecRuntime *runtime,
3959 const CGroupContext *cgroup_context,
3960 int socket_fd,
3961 const int named_iofds[static 3],
3962 int *params_fds,
3963 size_t n_socket_fds,
3964 size_t n_storage_fds,
3965 char **files_env,
3966 int user_lookup_fd,
3967 int *exit_status) {
3968
3969 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
3970 int r, ngids = 0, exec_fd;
3971 _cleanup_free_ gid_t *supplementary_gids = NULL;
3972 const char *username = NULL, *groupname = NULL;
3973 _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
3974 const char *home = NULL, *shell = NULL;
3975 char **final_argv = NULL;
3976 dev_t journal_stream_dev = 0;
3977 ino_t journal_stream_ino = 0;
3978 bool userns_set_up = false;
3979 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3980 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3981 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3982 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
3983 #if HAVE_SELINUX
3984 _cleanup_free_ char *mac_selinux_context_net = NULL;
3985 bool use_selinux = false;
3986 #endif
3987 #if ENABLE_SMACK
3988 bool use_smack = false;
3989 #endif
3990 #if HAVE_APPARMOR
3991 bool use_apparmor = false;
3992 #endif
3993 uid_t saved_uid = getuid();
3994 gid_t saved_gid = getgid();
3995 uid_t uid = UID_INVALID;
3996 gid_t gid = GID_INVALID;
3997 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3998 n_keep_fds; /* total number of fds not to close */
3999 int secure_bits;
4000 _cleanup_free_ gid_t *gids_after_pam = NULL;
4001 int ngids_after_pam = 0;
4002 _cleanup_free_ int *fds = NULL;
4003 _cleanup_strv_free_ char **fdnames = NULL;
4004
4005 assert(unit);
4006 assert(command);
4007 assert(context);
4008 assert(params);
4009 assert(exit_status);
4010
4011 /* Explicitly test for CVE-2021-4034 inspired invocations */
4012 assert(command->path);
4013 assert(!strv_isempty(command->argv));
4014
4015 rename_process_from_path(command->path);
4016
4017 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4018 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4019 * both of which will be demoted to SIG_DFL. */
4020 (void) default_signals(SIGNALS_CRASH_HANDLER,
4021 SIGNALS_IGNORE);
4022
4023 if (context->ignore_sigpipe)
4024 (void) ignore_signals(SIGPIPE);
4025
4026 r = reset_signal_mask();
4027 if (r < 0) {
4028 *exit_status = EXIT_SIGNAL_MASK;
4029 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4030 }
4031
4032 if (params->idle_pipe)
4033 do_idle_pipe_dance(params->idle_pipe);
4034
4035 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4036 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4037 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4038 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4039
4040 log_forget_fds();
4041 log_set_open_when_needed(true);
4042 log_settle_target();
4043 if (context->log_level_max >= 0)
4044 log_set_max_level(context->log_level_max);
4045
4046 /* In case anything used libc syslog(), close this here, too */
4047 closelog();
4048
4049 fds = newdup(int, params_fds, n_fds);
4050 if (!fds) {
4051 *exit_status = EXIT_MEMORY;
4052 return log_oom();
4053 }
4054
4055 fdnames = strv_copy((char**) params->fd_names);
4056 if (!fdnames) {
4057 *exit_status = EXIT_MEMORY;
4058 return log_oom();
4059 }
4060
4061 r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4062 if (r < 0) {
4063 *exit_status = EXIT_FDS;
4064 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4065 }
4066
4067 int keep_fds[n_fds + 3];
4068 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4069 n_keep_fds = n_fds;
4070
4071 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4072 if (r < 0) {
4073 *exit_status = EXIT_FDS;
4074 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4075 }
4076
4077 #if HAVE_LIBBPF
4078 if (unit->manager->restrict_fs) {
4079 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4080 if (bpf_map_fd < 0) {
4081 *exit_status = EXIT_FDS;
4082 return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4083 }
4084
4085 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4086 if (r < 0) {
4087 *exit_status = EXIT_FDS;
4088 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4089 }
4090 }
4091 #endif
4092
4093 r = close_remaining_fds(params, runtime, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4094 if (r < 0) {
4095 *exit_status = EXIT_FDS;
4096 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4097 }
4098
4099 if (!context->same_pgrp &&
4100 setsid() < 0) {
4101 *exit_status = EXIT_SETSID;
4102 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4103 }
4104
4105 exec_context_tty_reset(context, params);
4106
4107 if (unit_shall_confirm_spawn(unit)) {
4108 _cleanup_free_ char *cmdline = NULL;
4109
4110 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4111 if (!cmdline) {
4112 *exit_status = EXIT_MEMORY;
4113 return log_oom();
4114 }
4115
4116 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4117 if (r != CONFIRM_EXECUTE) {
4118 if (r == CONFIRM_PRETEND_SUCCESS) {
4119 *exit_status = EXIT_SUCCESS;
4120 return 0;
4121 }
4122
4123 *exit_status = EXIT_CONFIRM;
4124 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4125 "Execution cancelled by the user");
4126 }
4127 }
4128
4129 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4130 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4131 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4132 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4133 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4134 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4135 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4136 *exit_status = EXIT_MEMORY;
4137 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4138 }
4139
4140 if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4141 _cleanup_strv_free_ char **suggested_paths = NULL;
4142
4143 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4144 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4145 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4146 *exit_status = EXIT_USER;
4147 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4148 }
4149
4150 r = compile_suggested_paths(context, params, &suggested_paths);
4151 if (r < 0) {
4152 *exit_status = EXIT_MEMORY;
4153 return log_oom();
4154 }
4155
4156 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4157 if (r < 0) {
4158 *exit_status = EXIT_USER;
4159 if (r == -EILSEQ)
4160 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4161 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4162 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4163 }
4164
4165 if (!uid_is_valid(uid)) {
4166 *exit_status = EXIT_USER;
4167 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4168 }
4169
4170 if (!gid_is_valid(gid)) {
4171 *exit_status = EXIT_USER;
4172 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4173 }
4174
4175 if (runtime->dynamic_creds->user)
4176 username = runtime->dynamic_creds->user->name;
4177
4178 } else {
4179 if (context->user) {
4180 r = get_fixed_user(context->user, &username, &uid, &gid, &home, &shell);
4181 if (r < 0) {
4182 *exit_status = EXIT_USER;
4183 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
4184 }
4185 }
4186
4187 if (context->group) {
4188 r = get_fixed_group(context->group, &groupname, &gid);
4189 if (r < 0) {
4190 *exit_status = EXIT_GROUP;
4191 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4192 }
4193 }
4194 }
4195
4196 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4197 r = get_supplementary_groups(context, username, groupname, gid,
4198 &supplementary_gids, &ngids);
4199 if (r < 0) {
4200 *exit_status = EXIT_GROUP;
4201 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
4202 }
4203
4204 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4205 if (r < 0) {
4206 *exit_status = EXIT_USER;
4207 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
4208 }
4209
4210 user_lookup_fd = safe_close(user_lookup_fd);
4211
4212 r = acquire_home(context, uid, &home, &home_buffer);
4213 if (r < 0) {
4214 *exit_status = EXIT_CHDIR;
4215 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
4216 }
4217
4218 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4219 if (socket_fd >= 0)
4220 (void) fd_nonblock(socket_fd, false);
4221
4222 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4223 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4224 if (params->cgroup_path) {
4225 _cleanup_free_ char *p = NULL;
4226
4227 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4228 if (r < 0) {
4229 *exit_status = EXIT_CGROUP;
4230 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4231 }
4232
4233 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4234 if (r == -EUCLEAN) {
4235 *exit_status = EXIT_CGROUP;
4236 return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4237 "because the cgroup or one of its parents or "
4238 "siblings is in the threaded mode: %m", p);
4239 }
4240 if (r < 0) {
4241 *exit_status = EXIT_CGROUP;
4242 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4243 }
4244 }
4245
4246 if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4247 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4248 if (r < 0) {
4249 *exit_status = EXIT_NETWORK;
4250 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4251 }
4252 }
4253
4254 if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4255 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4256 if (r < 0) {
4257 *exit_status = EXIT_NAMESPACE;
4258 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4259 }
4260 }
4261
4262 r = setup_input(context, params, socket_fd, named_iofds);
4263 if (r < 0) {
4264 *exit_status = EXIT_STDIN;
4265 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
4266 }
4267
4268 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4269 if (r < 0) {
4270 *exit_status = EXIT_STDOUT;
4271 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
4272 }
4273
4274 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4275 if (r < 0) {
4276 *exit_status = EXIT_STDERR;
4277 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
4278 }
4279
4280 if (context->oom_score_adjust_set) {
4281 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4282 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4283 r = set_oom_score_adjust(context->oom_score_adjust);
4284 if (ERRNO_IS_NEG_PRIVILEGE(r))
4285 log_unit_debug_errno(unit, r,
4286 "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4287 else if (r < 0) {
4288 *exit_status = EXIT_OOM_ADJUST;
4289 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
4290 }
4291 }
4292
4293 if (context->coredump_filter_set) {
4294 r = set_coredump_filter(context->coredump_filter);
4295 if (ERRNO_IS_NEG_PRIVILEGE(r))
4296 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4297 else if (r < 0) {
4298 *exit_status = EXIT_LIMITS;
4299 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4300 }
4301 }
4302
4303 if (context->nice_set) {
4304 r = setpriority_closest(context->nice);
4305 if (r < 0) {
4306 *exit_status = EXIT_NICE;
4307 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4308 }
4309 }
4310
4311 if (context->cpu_sched_set) {
4312 struct sched_param param = {
4313 .sched_priority = context->cpu_sched_priority,
4314 };
4315
4316 r = sched_setscheduler(0,
4317 context->cpu_sched_policy |
4318 (context->cpu_sched_reset_on_fork ?
4319 SCHED_RESET_ON_FORK : 0),
4320 &param);
4321 if (r < 0) {
4322 *exit_status = EXIT_SETSCHEDULER;
4323 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4324 }
4325 }
4326
4327 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4328 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4329 const CPUSet *cpu_set;
4330
4331 if (context->cpu_affinity_from_numa) {
4332 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4333 if (r < 0) {
4334 *exit_status = EXIT_CPUAFFINITY;
4335 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4336 }
4337
4338 cpu_set = &converted_cpu_set;
4339 } else
4340 cpu_set = &context->cpu_set;
4341
4342 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4343 *exit_status = EXIT_CPUAFFINITY;
4344 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4345 }
4346 }
4347
4348 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4349 r = apply_numa_policy(&context->numa_policy);
4350 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4351 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4352 else if (r < 0) {
4353 *exit_status = EXIT_NUMA_POLICY;
4354 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4355 }
4356 }
4357
4358 if (context->ioprio_set)
4359 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4360 *exit_status = EXIT_IOPRIO;
4361 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4362 }
4363
4364 if (context->timer_slack_nsec != NSEC_INFINITY)
4365 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4366 *exit_status = EXIT_TIMERSLACK;
4367 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4368 }
4369
4370 if (context->personality != PERSONALITY_INVALID) {
4371 r = safe_personality(context->personality);
4372 if (r < 0) {
4373 *exit_status = EXIT_PERSONALITY;
4374 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4375 }
4376 }
4377
4378 if (context->utmp_id) {
4379 const char *line = context->tty_path ?
4380 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4381 NULL;
4382 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4383 line,
4384 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4385 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4386 USER_PROCESS,
4387 username);
4388 }
4389
4390 if (uid_is_valid(uid)) {
4391 r = chown_terminal(STDIN_FILENO, uid);
4392 if (r < 0) {
4393 *exit_status = EXIT_STDIN;
4394 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4395 }
4396 }
4397
4398 if (params->cgroup_path) {
4399 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4400 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4401 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4402 * touch a single hierarchy too. */
4403
4404 if (params->flags & EXEC_CGROUP_DELEGATE) {
4405 _cleanup_free_ char *p = NULL;
4406
4407 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4408 if (r < 0) {
4409 *exit_status = EXIT_CGROUP;
4410 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4411 }
4412
4413 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4414 if (r < 0) {
4415 *exit_status = EXIT_CGROUP;
4416 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4417 }
4418 if (r > 0) {
4419 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
4420 if (r < 0) {
4421 *exit_status = EXIT_CGROUP;
4422 return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
4423 }
4424 }
4425 }
4426
4427 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4428 if (cgroup_context_want_memory_pressure(cgroup_context)) {
4429 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4430 if (r < 0) {
4431 *exit_status = EXIT_MEMORY;
4432 return log_oom();
4433 }
4434
4435 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4436 if (r < 0) {
4437 log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4438 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4439 memory_pressure_path = mfree(memory_pressure_path);
4440 }
4441 } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4442 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4443 if (!memory_pressure_path) {
4444 *exit_status = EXIT_MEMORY;
4445 return log_oom();
4446 }
4447 }
4448 }
4449 }
4450
4451 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4452
4453 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4454 r = setup_exec_directory(unit, context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4455 if (r < 0)
4456 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4457 }
4458
4459 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4460 r = exec_setup_credentials(context, params, unit->id, uid, gid);
4461 if (r < 0) {
4462 *exit_status = EXIT_CREDENTIALS;
4463 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4464 }
4465 }
4466
4467 r = build_environment(
4468 unit,
4469 context,
4470 params,
4471 cgroup_context,
4472 n_fds,
4473 fdnames,
4474 home,
4475 username,
4476 shell,
4477 journal_stream_dev,
4478 journal_stream_ino,
4479 memory_pressure_path,
4480 &our_env);
4481 if (r < 0) {
4482 *exit_status = EXIT_MEMORY;
4483 return log_oom();
4484 }
4485
4486 r = build_pass_environment(context, &pass_env);
4487 if (r < 0) {
4488 *exit_status = EXIT_MEMORY;
4489 return log_oom();
4490 }
4491
4492 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4493 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4494 * not specify PATH but the unit has ExecSearchPath. */
4495 if (!strv_isempty(context->exec_search_path)) {
4496 _cleanup_free_ char *joined = NULL;
4497
4498 joined = strv_join(context->exec_search_path, ":");
4499 if (!joined) {
4500 *exit_status = EXIT_MEMORY;
4501 return log_oom();
4502 }
4503
4504 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4505 if (r < 0) {
4506 *exit_status = EXIT_MEMORY;
4507 return log_oom();
4508 }
4509 }
4510
4511 accum_env = strv_env_merge(params->environment,
4512 our_env,
4513 joined_exec_search_path,
4514 pass_env,
4515 context->environment,
4516 files_env);
4517 if (!accum_env) {
4518 *exit_status = EXIT_MEMORY;
4519 return log_oom();
4520 }
4521 accum_env = strv_env_clean(accum_env);
4522
4523 (void) umask(context->umask);
4524
4525 r = setup_keyring(unit, context, params, uid, gid);
4526 if (r < 0) {
4527 *exit_status = EXIT_KEYRING;
4528 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4529 }
4530
4531 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4532 * from it. */
4533 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4534
4535 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4536 * for it, and the kernel doesn't actually support ambient caps. */
4537 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4538
4539 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4540 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4541 * desired. */
4542 if (needs_ambient_hack)
4543 needs_setuid = false;
4544 else
4545 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4546
4547 uint64_t capability_ambient_set = context->capability_ambient_set;
4548
4549 if (needs_sandboxing) {
4550 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4551 * /sys being present. The actual MAC context application will happen later, as late as
4552 * possible, to avoid impacting our own code paths. */
4553
4554 #if HAVE_SELINUX
4555 use_selinux = mac_selinux_use();
4556 #endif
4557 #if ENABLE_SMACK
4558 use_smack = mac_smack_use();
4559 #endif
4560 #if HAVE_APPARMOR
4561 use_apparmor = mac_apparmor_use();
4562 #endif
4563 }
4564
4565 if (needs_sandboxing) {
4566 int which_failed;
4567
4568 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4569 * is set here. (See below.) */
4570
4571 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4572 if (r < 0) {
4573 *exit_status = EXIT_LIMITS;
4574 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4575 }
4576 }
4577
4578 if (needs_setuid && context->pam_name && username) {
4579 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4580 * wins here. (See above.) */
4581
4582 /* All fds passed in the fds array will be closed in the pam child process. */
4583 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4584 if (r < 0) {
4585 *exit_status = EXIT_PAM;
4586 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4587 }
4588
4589 if (ambient_capabilities_supported()) {
4590 uint64_t ambient_after_pam;
4591
4592 /* PAM modules might have set some ambient caps. Query them here and merge them into
4593 * the caps we want to set in the end, so that we don't end up unsetting them. */
4594 r = capability_get_ambient(&ambient_after_pam);
4595 if (r < 0) {
4596 *exit_status = EXIT_CAPABILITIES;
4597 return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
4598 }
4599
4600 capability_ambient_set |= ambient_after_pam;
4601 }
4602
4603 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4604 if (ngids_after_pam < 0) {
4605 *exit_status = EXIT_MEMORY;
4606 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4607 }
4608 }
4609
4610 if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
4611 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4612 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4613 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4614
4615 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4616 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4617 * the actual requested operations fail (or silently continue). */
4618 if (r < 0 && context->private_users) {
4619 *exit_status = EXIT_USER;
4620 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4621 }
4622 if (r < 0)
4623 log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4624 else
4625 userns_set_up = true;
4626 }
4627
4628 if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4629
4630 /* Try to enable network namespacing if network namespacing is available and we have
4631 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4632 * new network namespace. And if we don't have that, then we could only create a network
4633 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4634 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
4635 r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
4636 if (ERRNO_IS_NEG_PRIVILEGE(r))
4637 log_unit_notice_errno(unit, r,
4638 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4639 else if (r < 0) {
4640 *exit_status = EXIT_NETWORK;
4641 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4642 }
4643 } else if (context->network_namespace_path) {
4644 *exit_status = EXIT_NETWORK;
4645 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4646 "NetworkNamespacePath= is not supported, refusing.");
4647 } else
4648 log_unit_notice(unit, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4649 }
4650
4651 if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4652
4653 if (ns_type_supported(NAMESPACE_IPC)) {
4654 r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4655 if (r == -EPERM)
4656 log_unit_warning_errno(unit, r,
4657 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4658 else if (r < 0) {
4659 *exit_status = EXIT_NAMESPACE;
4660 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4661 }
4662 } else if (context->ipc_namespace_path) {
4663 *exit_status = EXIT_NAMESPACE;
4664 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4665 "IPCNamespacePath= is not supported, refusing.");
4666 } else
4667 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4668 }
4669
4670 if (needs_mount_namespace) {
4671 _cleanup_free_ char *error_path = NULL;
4672
4673 r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
4674 if (r < 0) {
4675 *exit_status = EXIT_NAMESPACE;
4676 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4677 error_path ? ": " : "", strempty(error_path));
4678 }
4679 }
4680
4681 if (needs_sandboxing) {
4682 r = apply_protect_hostname(unit, context, exit_status);
4683 if (r < 0)
4684 return r;
4685 }
4686
4687 if (context->memory_ksm >= 0)
4688 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
4689 if (ERRNO_IS_NOT_SUPPORTED(errno))
4690 log_unit_debug_errno(unit, errno, "KSM support not available, ignoring.");
4691 else {
4692 *exit_status = EXIT_KSM;
4693 return log_unit_error_errno(unit, errno, "Failed to set KSM: %m");
4694 }
4695 }
4696
4697 /* Drop groups as early as possible.
4698 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4699 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4700 if (needs_setuid) {
4701 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4702 int ngids_to_enforce = 0;
4703
4704 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4705 ngids,
4706 gids_after_pam,
4707 ngids_after_pam,
4708 &gids_to_enforce);
4709 if (ngids_to_enforce < 0) {
4710 *exit_status = EXIT_MEMORY;
4711 return log_unit_error_errno(unit,
4712 ngids_to_enforce,
4713 "Failed to merge group lists. Group membership might be incorrect: %m");
4714 }
4715
4716 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4717 if (r < 0) {
4718 *exit_status = EXIT_GROUP;
4719 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4720 }
4721 }
4722
4723 /* If the user namespace was not set up above, try to do it now.
4724 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4725 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4726 * case of mount namespaces being less privileged when the mount point list is copied from a
4727 * different user namespace). */
4728
4729 if (needs_sandboxing && context->private_users && !userns_set_up) {
4730 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4731 if (r < 0) {
4732 *exit_status = EXIT_USER;
4733 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4734 }
4735 }
4736
4737 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4738 * shall execute. */
4739
4740 _cleanup_free_ char *executable = NULL;
4741 _cleanup_close_ int executable_fd = -EBADF;
4742 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4743 if (r < 0) {
4744 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4745 log_unit_struct_errno(unit, LOG_INFO, r,
4746 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4747 LOG_UNIT_INVOCATION_ID(unit),
4748 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4749 command->path),
4750 "EXECUTABLE=%s", command->path);
4751 *exit_status = EXIT_SUCCESS;
4752 return 0;
4753 }
4754
4755 *exit_status = EXIT_EXEC;
4756 return log_unit_struct_errno(unit, LOG_INFO, r,
4757 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4758 LOG_UNIT_INVOCATION_ID(unit),
4759 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4760 command->path),
4761 "EXECUTABLE=%s", command->path);
4762 }
4763
4764 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4765 if (r < 0) {
4766 *exit_status = EXIT_FDS;
4767 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4768 }
4769
4770 #if HAVE_SELINUX
4771 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4772 int fd = -EBADF;
4773
4774 if (socket_fd >= 0)
4775 fd = socket_fd;
4776 else if (params->n_socket_fds == 1)
4777 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4778 * use context from that fd to compute the label. */
4779 fd = params->fds[0];
4780
4781 if (fd >= 0) {
4782 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4783 if (r < 0) {
4784 if (!context->selinux_context_ignore) {
4785 *exit_status = EXIT_SELINUX_CONTEXT;
4786 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4787 }
4788 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
4789 }
4790 }
4791 }
4792 #endif
4793
4794 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4795 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4796 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
4797 * execve(). */
4798
4799 r = close_all_fds(keep_fds, n_keep_fds);
4800 if (r >= 0)
4801 r = shift_fds(fds, n_fds);
4802 if (r >= 0)
4803 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
4804 if (r < 0) {
4805 *exit_status = EXIT_FDS;
4806 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4807 }
4808
4809 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4810 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4811 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4812 * came this far. */
4813
4814 secure_bits = context->secure_bits;
4815
4816 if (needs_sandboxing) {
4817 uint64_t bset;
4818
4819 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4820 * (Note this is placed after the general resource limit initialization, see above, in order
4821 * to take precedence.) */
4822 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4823 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4824 *exit_status = EXIT_LIMITS;
4825 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4826 }
4827 }
4828
4829 #if ENABLE_SMACK
4830 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4831 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4832 if (use_smack) {
4833 r = setup_smack(unit->manager, context, executable_fd);
4834 if (r < 0 && !context->smack_process_label_ignore) {
4835 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4836 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4837 }
4838 }
4839 #endif
4840
4841 bset = context->capability_bounding_set;
4842 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4843 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4844 * instead of us doing that */
4845 if (needs_ambient_hack)
4846 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4847 (UINT64_C(1) << CAP_SETUID) |
4848 (UINT64_C(1) << CAP_SETGID);
4849
4850 if (!cap_test_all(bset)) {
4851 r = capability_bounding_set_drop(bset, /* right_now= */ false);
4852 if (r < 0) {
4853 *exit_status = EXIT_CAPABILITIES;
4854 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4855 }
4856 }
4857
4858 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4859 * keep-caps set.
4860 *
4861 * To be able to raise the ambient capabilities after setresuid() they have to be added to
4862 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
4863 * the ambient capabilities can be raised as they are present in the permitted and
4864 * inheritable set. However it is possible that someone wants to set ambient capabilities
4865 * without changing the user, so we also set the ambient capabilities here.
4866 *
4867 * The requested ambient capabilities are raised in the inheritable set if the second
4868 * argument is true. */
4869 if (!needs_ambient_hack) {
4870 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
4871 if (r < 0) {
4872 *exit_status = EXIT_CAPABILITIES;
4873 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4874 }
4875 }
4876 }
4877
4878 /* chroot to root directory first, before we lose the ability to chroot */
4879 r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
4880 if (r < 0)
4881 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4882
4883 if (needs_setuid) {
4884 if (uid_is_valid(uid)) {
4885 r = enforce_user(context, uid, capability_ambient_set);
4886 if (r < 0) {
4887 *exit_status = EXIT_USER;
4888 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4889 }
4890
4891 if (!needs_ambient_hack && capability_ambient_set != 0) {
4892
4893 /* Raise the ambient capabilities after user change. */
4894 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
4895 if (r < 0) {
4896 *exit_status = EXIT_CAPABILITIES;
4897 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4898 }
4899 }
4900 }
4901 }
4902
4903 /* Apply working directory here, because the working directory might be on NFS and only the user running
4904 * this service might have the correct privilege to change to the working directory */
4905 r = apply_working_directory(context, params, runtime, home, exit_status);
4906 if (r < 0)
4907 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4908
4909 if (needs_sandboxing) {
4910 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4911 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4912 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4913 * are restricted. */
4914
4915 #if HAVE_SELINUX
4916 if (use_selinux) {
4917 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4918
4919 if (exec_context) {
4920 r = setexeccon(exec_context);
4921 if (r < 0) {
4922 if (!context->selinux_context_ignore) {
4923 *exit_status = EXIT_SELINUX_CONTEXT;
4924 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4925 }
4926 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
4927 }
4928 }
4929 }
4930 #endif
4931
4932 #if HAVE_APPARMOR
4933 if (use_apparmor && context->apparmor_profile) {
4934 r = aa_change_onexec(context->apparmor_profile);
4935 if (r < 0 && !context->apparmor_profile_ignore) {
4936 *exit_status = EXIT_APPARMOR_PROFILE;
4937 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4938 }
4939 }
4940 #endif
4941
4942 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
4943 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
4944 * requires CAP_SETPCAP. */
4945 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4946 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4947 * effective set here.
4948 *
4949 * The effective set is overwritten during execve() with the following values:
4950 *
4951 * - ambient set (for non-root processes)
4952 *
4953 * - (inheritable | bounding) set (for root processes)
4954 *
4955 * Hence there is no security impact to raise it in the effective set before execve
4956 */
4957 r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
4958 if (r < 0) {
4959 *exit_status = EXIT_CAPABILITIES;
4960 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4961 }
4962 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4963 *exit_status = EXIT_SECUREBITS;
4964 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
4965 }
4966 }
4967
4968 if (context_has_no_new_privileges(context))
4969 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4970 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4971 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
4972 }
4973
4974 #if HAVE_SECCOMP
4975 r = apply_address_families(unit, context);
4976 if (r < 0) {
4977 *exit_status = EXIT_ADDRESS_FAMILIES;
4978 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4979 }
4980
4981 r = apply_memory_deny_write_execute(unit, context);
4982 if (r < 0) {
4983 *exit_status = EXIT_SECCOMP;
4984 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
4985 }
4986
4987 r = apply_restrict_realtime(unit, context);
4988 if (r < 0) {
4989 *exit_status = EXIT_SECCOMP;
4990 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
4991 }
4992
4993 r = apply_restrict_suid_sgid(unit, context);
4994 if (r < 0) {
4995 *exit_status = EXIT_SECCOMP;
4996 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4997 }
4998
4999 r = apply_restrict_namespaces(unit, context);
5000 if (r < 0) {
5001 *exit_status = EXIT_SECCOMP;
5002 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5003 }
5004
5005 r = apply_protect_sysctl(unit, context);
5006 if (r < 0) {
5007 *exit_status = EXIT_SECCOMP;
5008 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5009 }
5010
5011 r = apply_protect_kernel_modules(unit, context);
5012 if (r < 0) {
5013 *exit_status = EXIT_SECCOMP;
5014 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5015 }
5016
5017 r = apply_protect_kernel_logs(unit, context);
5018 if (r < 0) {
5019 *exit_status = EXIT_SECCOMP;
5020 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5021 }
5022
5023 r = apply_protect_clock(unit, context);
5024 if (r < 0) {
5025 *exit_status = EXIT_SECCOMP;
5026 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5027 }
5028
5029 r = apply_private_devices(unit, context);
5030 if (r < 0) {
5031 *exit_status = EXIT_SECCOMP;
5032 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5033 }
5034
5035 r = apply_syscall_archs(unit, context);
5036 if (r < 0) {
5037 *exit_status = EXIT_SECCOMP;
5038 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5039 }
5040
5041 r = apply_lock_personality(unit, context);
5042 if (r < 0) {
5043 *exit_status = EXIT_SECCOMP;
5044 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5045 }
5046
5047 r = apply_syscall_log(unit, context);
5048 if (r < 0) {
5049 *exit_status = EXIT_SECCOMP;
5050 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5051 }
5052
5053 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5054 * by the filter as little as possible. */
5055 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5056 if (r < 0) {
5057 *exit_status = EXIT_SECCOMP;
5058 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5059 }
5060 #endif
5061
5062 #if HAVE_LIBBPF
5063 r = apply_restrict_filesystems(unit, context);
5064 if (r < 0) {
5065 *exit_status = EXIT_BPF;
5066 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5067 }
5068 #endif
5069
5070 }
5071
5072 if (!strv_isempty(context->unset_environment)) {
5073 char **ee = NULL;
5074
5075 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5076 if (!ee) {
5077 *exit_status = EXIT_MEMORY;
5078 return log_oom();
5079 }
5080
5081 strv_free_and_replace(accum_env, ee);
5082 }
5083
5084 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5085 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5086
5087 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5088 if (r < 0) {
5089 *exit_status = EXIT_MEMORY;
5090 return log_unit_error_errno(unit, r, "Failed to replace environment variables: %m");
5091 }
5092 final_argv = replaced_argv;
5093
5094 if (!strv_isempty(unset_variables)) {
5095 _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5096 log_unit_warning(unit, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5097 }
5098
5099 if (!strv_isempty(bad_variables)) {
5100 _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5101 log_unit_warning(unit, "Invalid environment variable name evaluates to an empty string: %s", strna(jb));;
5102 }
5103 } else
5104 final_argv = command->argv;
5105
5106 log_command_line(unit, "Executing", executable, final_argv);
5107
5108 if (exec_fd >= 0) {
5109 uint8_t hot = 1;
5110
5111 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5112 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5113
5114 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5115 *exit_status = EXIT_EXEC;
5116 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5117 }
5118 }
5119
5120 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5121
5122 if (exec_fd >= 0) {
5123 uint8_t hot = 0;
5124
5125 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5126 * that POLLHUP on it no longer means execve() succeeded. */
5127
5128 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5129 *exit_status = EXIT_EXEC;
5130 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5131 }
5132 }
5133
5134 *exit_status = EXIT_EXEC;
5135 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5136 }
5137
/* Forward declarations: both helpers are defined further down in this file but are already needed by
 * exec_spawn() below. */
static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5140
/* Forks off a child process for 'command' and runs exec_child() in it.
 *
 * Before forking, the parent resolves the named stdio file descriptors, loads the environment
 * assignments from context->environment_files and, if params->cgroup_path is set, determines (and if
 * needed creates) the subcgroup for the new process.
 *
 * On success the child's PID is stored in *ret and 0 is returned. On failure nothing is forked (or
 * fork() itself failed) and the value of the respective log_unit_error_errno() call is returned
 * (presumably a negative errno-style code; confirm against the log helpers). The child itself never
 * returns from this function: it always terminates via _exit(). */
int exec_spawn(Unit *unit,
               ExecCommand *command,
               const ExecContext *context,
               const ExecParameters *params,
               ExecRuntime *runtime,
               const CGroupContext *cgroup_context,
               pid_t *ret) {

        int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
        _cleanup_free_ char *subcgroup_path = NULL;
        _cleanup_strv_free_ char **files_env = NULL;
        size_t n_storage_fds = 0, n_socket_fds = 0;
        pid_t pid;

        assert(unit);
        assert(command);
        assert(context);
        assert(ret);
        assert(params);
        /* An fd array is only optional if there are no fds to pass at all */
        assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));

        LOG_CONTEXT_PUSH_UNIT(unit);

        if (context->std_input == EXEC_INPUT_SOCKET ||
            context->std_output == EXEC_OUTPUT_SOCKET ||
            context->std_error == EXEC_OUTPUT_SOCKET) {

                /* At least one stdio stream is connected to a socket: exactly one socket fd must have
                 * been supplied. It is handed over as socket_fd, while fds/n_socket_fds/n_storage_fds
                 * keep their NULL/0 initializers in this branch. */

                if (params->n_socket_fds > 1)
                        return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");

                if (params->n_socket_fds == 0)
                        return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");

                socket_fd = params->fds[0];
        } else {
                /* No socket-backed stdio: pass the full fd set through unchanged */
                socket_fd = -EBADF;
                fds = params->fds;
                n_socket_fds = params->n_socket_fds;
                n_storage_fds = params->n_storage_fds;
        }

        /* Fill in named_iofds[] for every stdio stream that is configured as a named fd */
        r = exec_context_named_iofds(context, params, named_iofds);
        if (r < 0)
                return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");

        r = exec_context_load_environment(unit, context, &files_env);
        if (r < 0)
                return log_unit_error_errno(unit, r, "Failed to load environment files: %m");

        /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
           and, until the next SELinux policy changes, we save further reloads in future children. */
        mac_selinux_maybe_reload();

        /* We won't know the real executable path until we create the mount namespace in the child, but we
           want to log from the parent, so we use the possibly inaccurate path here. */
        log_command_line(unit, "About to execute", command->path, command->argv);

        if (params->cgroup_path) {
                r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
                if (r < 0)
                        return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
                if (r > 0) {
                        /* If there's a subcgroup, then let's create it here now (the main cgroup was already
                         * realized by the unit logic) */

                        r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
                        if (r < 0)
                                return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
                }
        }

        pid = fork();
        if (pid < 0)
                return log_unit_error_errno(unit, errno, "Failed to fork: %m");

        if (pid == 0) {
                /* Child: exec_child() stores the exit code to use in exit_status. Whatever it returns,
                 * this branch ends in _exit(), so the child never runs the parent code below. */
                int exit_status;

                r = exec_child(unit,
                               command,
                               context,
                               params,
                               runtime,
                               cgroup_context,
                               socket_fd,
                               named_iofds,
                               fds,
                               n_socket_fds,
                               n_storage_fds,
                               files_env,
                               unit->manager->user_lookup_fds[1],
                               &exit_status);

                if (r < 0) {
                        /* Translate the exit status into its symbolic step name for the log record */
                        const char *status = ASSERT_PTR(
                                        exit_status_to_string(exit_status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD));

                        log_unit_struct_errno(unit, LOG_ERR, r,
                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
                                              LOG_UNIT_INVOCATION_ID(unit),
                                              LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
                                                               status, command->path),
                                              "EXECUTABLE=%s", command->path);
                } else
                        /* A non-negative return without a successful exec is only expected together
                         * with a success exit status */
                        assert(exit_status == EXIT_SUCCESS);

                _exit(exit_status);
        }

        /* Parent from here on */
        log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);

        /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
         * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
         * process will be killed too). */
        if (subcgroup_path)
                (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);

        /* Hand the new PID to exec_status_start() for this command's status record */
        exec_status_start(&command->exec_status, pid);

        *ret = pid;
        return 0;
}
5263
5264 void exec_context_init(ExecContext *c) {
5265 assert(c);
5266
5267 *c = (ExecContext) {
5268 .umask = 0022,
5269 .ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO,
5270 .cpu_sched_policy = SCHED_OTHER,
5271 .syslog_priority = LOG_DAEMON|LOG_INFO,
5272 .syslog_level_prefix = true,
5273 .ignore_sigpipe = true,
5274 .timer_slack_nsec = NSEC_INFINITY,
5275 .personality = PERSONALITY_INVALID,
5276 .timeout_clean_usec = USEC_INFINITY,
5277 .capability_bounding_set = CAP_MASK_UNSET,
5278 .restrict_namespaces = NAMESPACE_FLAGS_INITIAL,
5279 .log_level_max = -1,
5280 #if HAVE_SECCOMP
5281 .syscall_errno = SECCOMP_ERROR_NUMBER_KILL,
5282 #endif
5283 .tty_rows = UINT_MAX,
5284 .tty_cols = UINT_MAX,
5285 .private_mounts = -1,
5286 .memory_ksm = -1,
5287 .set_login_environment = -1,
5288 };
5289
5290 FOREACH_ARRAY(d, c->directories, _EXEC_DIRECTORY_TYPE_MAX)
5291 d->mode = 0755;
5292
5293 numa_policy_reset(&c->numa_policy);
5294
5295 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5296 }
5297
/* Releases everything owned by the execution context *c. The ExecContext object itself is not
 * freed. Each released pointer field is overwritten with the return value of its free helper
 * (presumably NULL, matching the explicit NULL assignments used for bind_mounts and
 * temporary_filesystems below), and the associated size/count fields are reset to 0. */
void exec_context_done(ExecContext *c) {
        assert(c);

        /* Environment configuration */
        c->environment = strv_free(c->environment);
        c->environment_files = strv_free(c->environment_files);
        c->pass_environment = strv_free(c->pass_environment);
        c->unset_environment = strv_free(c->unset_environment);

        rlimit_free_all(c->rlimit);

        /* Per-stream (stdin/stdout/stderr) fd names and file paths */
        for (size_t l = 0; l < 3; l++) {
                c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
                c->stdio_file[l] = mfree(c->stdio_file[l]);
        }

        /* Working directory, root directory/image and the verity data that goes with it */
        c->working_directory = mfree(c->working_directory);
        c->root_directory = mfree(c->root_directory);
        c->root_image = mfree(c->root_image);
        c->root_image_options = mount_options_free_all(c->root_image_options);
        c->root_hash = mfree(c->root_hash);
        c->root_hash_size = 0;
        c->root_hash_path = mfree(c->root_hash_path);
        c->root_hash_sig = mfree(c->root_hash_sig);
        c->root_hash_sig_size = 0;
        c->root_hash_sig_path = mfree(c->root_hash_sig_path);
        c->root_verity = mfree(c->root_verity);
        c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
        c->extension_directories = strv_free(c->extension_directories);

        /* TTY, syslog identifier and credentials of the process */
        c->tty_path = mfree(c->tty_path);
        c->syslog_identifier = mfree(c->syslog_identifier);
        c->user = mfree(c->user);
        c->group = mfree(c->group);

        c->supplementary_groups = strv_free(c->supplementary_groups);

        c->pam_name = mfree(c->pam_name);

        /* Path lists */
        c->read_only_paths = strv_free(c->read_only_paths);
        c->read_write_paths = strv_free(c->read_write_paths);
        c->inaccessible_paths = strv_free(c->inaccessible_paths);
        c->exec_paths = strv_free(c->exec_paths);
        c->no_exec_paths = strv_free(c->no_exec_paths);
        c->exec_search_path = strv_free(c->exec_search_path);

        /* These two helpers do not hand back a replacement pointer, hence array and count are reset
         * by hand */
        bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
        c->bind_mounts = NULL;
        c->n_bind_mounts = 0;
        temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
        c->temporary_filesystems = NULL;
        c->n_temporary_filesystems = 0;
        c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);

        /* CPU affinity and NUMA policy */
        cpu_set_reset(&c->cpu_set);
        numa_policy_reset(&c->numa_policy);

        /* utmp and security module labels */
        c->utmp_id = mfree(c->utmp_id);
        c->selinux_context = mfree(c->selinux_context);
        c->apparmor_profile = mfree(c->apparmor_profile);
        c->smack_process_label = mfree(c->smack_process_label);

        c->restrict_filesystems = set_free_free(c->restrict_filesystems);

        /* System call and address family filters */
        c->syscall_filter = hashmap_free(c->syscall_filter);
        c->syscall_archs = set_free(c->syscall_archs);
        c->address_families = set_free(c->address_families);

        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
                exec_directory_done(&c->directories[t]);

        /* Logging settings; -1 is the same value exec_context_init() assigns to log_level_max */
        c->log_level_max = -1;

        exec_context_free_log_extra_fields(c);
        c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns);
        c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns);

        c->log_ratelimit_interval_usec = 0;
        c->log_ratelimit_burst = 0;

        c->stdin_data = mfree(c->stdin_data);
        c->stdin_data_size = 0;

        /* Namespace references */
        c->network_namespace_path = mfree(c->network_namespace_path);
        c->ipc_namespace_path = mfree(c->ipc_namespace_path);

        c->log_namespace = mfree(c->log_namespace);

        /* Credentials */
        c->load_credentials = hashmap_free(c->load_credentials);
        c->set_credentials = hashmap_free(c->set_credentials);
        c->import_credentials = set_free_free(c->import_credentials);

        /* Image policies */
        c->root_image_policy = image_policy_free(c->root_image_policy);
        c->mount_image_policy = image_policy_free(c->mount_image_policy);
        c->extension_image_policy = image_policy_free(c->extension_image_policy);
}
5392
5393 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
5394 assert(c);
5395
5396 if (!runtime_prefix)
5397 return 0;
5398
5399 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
5400 _cleanup_free_ char *p = NULL;
5401
5402 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5403 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5404 else
5405 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5406 if (!p)
5407 return -ENOMEM;
5408
5409 /* We execute this synchronously, since we need to be sure this is gone when we start the
5410 * service next. */
5411 (void) rm_rf(p, REMOVE_ROOT);
5412
5413 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5414 _cleanup_free_ char *symlink_abs = NULL;
5415
5416 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5417 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5418 else
5419 symlink_abs = path_join(runtime_prefix, *symlink);
5420 if (!symlink_abs)
5421 return -ENOMEM;
5422
5423 (void) unlink(symlink_abs);
5424 }
5425 }
5426
5427 return 0;
5428 }
5429
5430 int exec_context_destroy_mount_ns_dir(Unit *u) {
5431 _cleanup_free_ char *p = NULL;
5432
5433 if (!u || !MANAGER_IS_SYSTEM(u->manager))
5434 return 0;
5435
5436 p = path_join("/run/systemd/propagate/", u->id);
5437 if (!p)
5438 return -ENOMEM;
5439
5440 /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
5441 if (rmdir(p) < 0 && errno != ENOENT)
5442 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
5443
5444 return 0;
5445 }
5446
5447 static void exec_command_done(ExecCommand *c) {
5448 assert(c);
5449
5450 c->path = mfree(c->path);
5451 c->argv = strv_free(c->argv);
5452 }
5453
5454 void exec_command_done_array(ExecCommand *c, size_t n) {
5455 for (size_t i = 0; i < n; i++)
5456 exec_command_done(c+i);
5457 }
5458
5459 ExecCommand* exec_command_free_list(ExecCommand *c) {
5460 ExecCommand *i;
5461
5462 while ((i = LIST_POP(command, c))) {
5463 exec_command_done(i);
5464 free(i);
5465 }
5466
5467 return NULL;
5468 }
5469
5470 void exec_command_free_array(ExecCommand **c, size_t n) {
5471 for (size_t i = 0; i < n; i++)
5472 c[i] = exec_command_free_list(c[i]);
5473 }
5474
5475 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5476 for (size_t i = 0; i < n; i++)
5477 exec_status_reset(&c[i].exec_status);
5478 }
5479
5480 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5481 for (size_t i = 0; i < n; i++)
5482 LIST_FOREACH(command, z, c[i])
5483 exec_status_reset(&z->exec_status);
5484 }
5485
/* Context handed as userdata to invalid_env(): tells the callback which unit to log against and
 * which environment file the rejected assignment was read from. */
typedef struct InvalidEnvInfo {
        const Unit *unit;       /* unit passed on to log_unit_error() */
        const char *path;       /* file name appended to the log message */
} InvalidEnvInfo;
5490
5491 static void invalid_env(const char *p, void *userdata) {
5492 InvalidEnvInfo *info = userdata;
5493
5494 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5495 }
5496
5497 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5498 assert(c);
5499
5500 switch (fd_index) {
5501
5502 case STDIN_FILENO:
5503 if (c->std_input != EXEC_INPUT_NAMED_FD)
5504 return NULL;
5505
5506 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5507
5508 case STDOUT_FILENO:
5509 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5510 return NULL;
5511
5512 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5513
5514 case STDERR_FILENO:
5515 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5516 return NULL;
5517
5518 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5519
5520 default:
5521 return NULL;
5522 }
5523 }
5524
5525 static int exec_context_named_iofds(
5526 const ExecContext *c,
5527 const ExecParameters *p,
5528 int named_iofds[static 3]) {
5529
5530 size_t targets;
5531 const char* stdio_fdname[3];
5532 size_t n_fds;
5533
5534 assert(c);
5535 assert(p);
5536 assert(named_iofds);
5537
5538 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5539 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5540 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5541
5542 for (size_t i = 0; i < 3; i++)
5543 stdio_fdname[i] = exec_context_fdname(c, i);
5544
5545 n_fds = p->n_storage_fds + p->n_socket_fds;
5546
5547 for (size_t i = 0; i < n_fds && targets > 0; i++)
5548 if (named_iofds[STDIN_FILENO] < 0 &&
5549 c->std_input == EXEC_INPUT_NAMED_FD &&
5550 stdio_fdname[STDIN_FILENO] &&
5551 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5552
5553 named_iofds[STDIN_FILENO] = p->fds[i];
5554 targets--;
5555
5556 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5557 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5558 stdio_fdname[STDOUT_FILENO] &&
5559 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5560
5561 named_iofds[STDOUT_FILENO] = p->fds[i];
5562 targets--;
5563
5564 } else if (named_iofds[STDERR_FILENO] < 0 &&
5565 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5566 stdio_fdname[STDERR_FILENO] &&
5567 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5568
5569 named_iofds[STDERR_FILENO] = p->fds[i];
5570 targets--;
5571 }
5572
5573 return targets == 0 ? 0 : -ENOENT;
5574 }
5575
5576 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5577 _cleanup_strv_free_ char **v = NULL;
5578 int r;
5579
5580 assert(c);
5581 assert(ret);
5582
5583 STRV_FOREACH(i, c->environment_files) {
5584 _cleanup_globfree_ glob_t pglob = {};
5585 bool ignore = false;
5586 char *fn = *i;
5587
5588 if (fn[0] == '-') {
5589 ignore = true;
5590 fn++;
5591 }
5592
5593 if (!path_is_absolute(fn)) {
5594 if (ignore)
5595 continue;
5596 return -EINVAL;
5597 }
5598
5599 /* Filename supports globbing, take all matching files */
5600 r = safe_glob(fn, 0, &pglob);
5601 if (r < 0) {
5602 if (ignore)
5603 continue;
5604 return r;
5605 }
5606
5607 /* When we don't match anything, -ENOENT should be returned */
5608 assert(pglob.gl_pathc > 0);
5609
5610 for (size_t n = 0; n < pglob.gl_pathc; n++) {
5611 _cleanup_strv_free_ char **p = NULL;
5612
5613 r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5614 if (r < 0) {
5615 if (ignore)
5616 continue;
5617 return r;
5618 }
5619
5620 /* Log invalid environment variables with filename */
5621 if (p) {
5622 InvalidEnvInfo info = {
5623 .unit = unit,
5624 .path = pglob.gl_pathv[n]
5625 };
5626
5627 p = strv_env_clean_with_callback(p, invalid_env, &info);
5628 }
5629
5630 if (!v)
5631 v = TAKE_PTR(p);
5632 else {
5633 char **m = strv_env_merge(v, p);
5634 if (!m)
5635 return -ENOMEM;
5636
5637 strv_free_and_replace(v, m);
5638 }
5639 }
5640 }
5641
5642 *ret = TAKE_PTR(v);
5643
5644 return 0;
5645 }
5646
5647 static bool tty_may_match_dev_console(const char *tty) {
5648 _cleanup_free_ char *resolved = NULL;
5649
5650 if (!tty)
5651 return true;
5652
5653 tty = skip_dev_prefix(tty);
5654
5655 /* trivial identity? */
5656 if (streq(tty, "console"))
5657 return true;
5658
5659 if (resolve_dev_console(&resolved) < 0)
5660 return true; /* if we could not resolve, assume it may */
5661
5662 /* "tty0" means the active VC, so it may be the same sometimes */
5663 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5664 }
5665
5666 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5667 assert(ec);
5668
5669 return ec->tty_reset ||
5670 ec->tty_vhangup ||
5671 ec->tty_vt_disallocate ||
5672 is_terminal_input(ec->std_input) ||
5673 is_terminal_output(ec->std_output) ||
5674 is_terminal_output(ec->std_error);
5675 }
5676
5677 bool exec_context_may_touch_console(const ExecContext *ec) {
5678
5679 return exec_context_may_touch_tty(ec) &&
5680 tty_may_match_dev_console(exec_context_tty_path(ec));
5681 }
5682
/* Writes every string of the NULL-terminated vector 'l' to 'f', each preceded by a single space.
 * A NULL vector writes nothing. */
static void strv_fprintf(FILE *f, char **l) {
        assert(f);

        if (!l)
                return;

        for (char **item = l; *item; item++)
                fprintf(f, " %s", *item);
}
5689
/* Writes one line of the form "<prefix><name>: <item> <item> ..." to 'f'. Nothing at all is
 * written if 'strv' is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5701
5702 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5703 int r;
5704
5705 assert(c);
5706 assert(f);
5707
5708 prefix = strempty(prefix);
5709
5710 fprintf(f,
5711 "%sUMask: %04o\n"
5712 "%sWorkingDirectory: %s\n"
5713 "%sRootDirectory: %s\n"
5714 "%sRootEphemeral: %s\n"
5715 "%sNonBlocking: %s\n"
5716 "%sPrivateTmp: %s\n"
5717 "%sPrivateDevices: %s\n"
5718 "%sProtectKernelTunables: %s\n"
5719 "%sProtectKernelModules: %s\n"
5720 "%sProtectKernelLogs: %s\n"
5721 "%sProtectClock: %s\n"
5722 "%sProtectControlGroups: %s\n"
5723 "%sPrivateNetwork: %s\n"
5724 "%sPrivateUsers: %s\n"
5725 "%sProtectHome: %s\n"
5726 "%sProtectSystem: %s\n"
5727 "%sMountAPIVFS: %s\n"
5728 "%sIgnoreSIGPIPE: %s\n"
5729 "%sMemoryDenyWriteExecute: %s\n"
5730 "%sRestrictRealtime: %s\n"
5731 "%sRestrictSUIDSGID: %s\n"
5732 "%sKeyringMode: %s\n"
5733 "%sProtectHostname: %s\n"
5734 "%sProtectProc: %s\n"
5735 "%sProcSubset: %s\n",
5736 prefix, c->umask,
5737 prefix, empty_to_root(c->working_directory),
5738 prefix, empty_to_root(c->root_directory),
5739 prefix, yes_no(c->root_ephemeral),
5740 prefix, yes_no(c->non_blocking),
5741 prefix, yes_no(c->private_tmp),
5742 prefix, yes_no(c->private_devices),
5743 prefix, yes_no(c->protect_kernel_tunables),
5744 prefix, yes_no(c->protect_kernel_modules),
5745 prefix, yes_no(c->protect_kernel_logs),
5746 prefix, yes_no(c->protect_clock),
5747 prefix, yes_no(c->protect_control_groups),
5748 prefix, yes_no(c->private_network),
5749 prefix, yes_no(c->private_users),
5750 prefix, protect_home_to_string(c->protect_home),
5751 prefix, protect_system_to_string(c->protect_system),
5752 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5753 prefix, yes_no(c->ignore_sigpipe),
5754 prefix, yes_no(c->memory_deny_write_execute),
5755 prefix, yes_no(c->restrict_realtime),
5756 prefix, yes_no(c->restrict_suid_sgid),
5757 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5758 prefix, yes_no(c->protect_hostname),
5759 prefix, protect_proc_to_string(c->protect_proc),
5760 prefix, proc_subset_to_string(c->proc_subset));
5761
5762 if (c->root_image)
5763 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5764
5765 if (c->root_image_options) {
5766 fprintf(f, "%sRootImageOptions:", prefix);
5767 LIST_FOREACH(mount_options, o, c->root_image_options)
5768 if (!isempty(o->options))
5769 fprintf(f, " %s:%s",
5770 partition_designator_to_string(o->partition_designator),
5771 o->options);
5772 fprintf(f, "\n");
5773 }
5774
5775 if (c->root_hash) {
5776 _cleanup_free_ char *encoded = NULL;
5777 encoded = hexmem(c->root_hash, c->root_hash_size);
5778 if (encoded)
5779 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5780 }
5781
5782 if (c->root_hash_path)
5783 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5784
5785 if (c->root_hash_sig) {
5786 _cleanup_free_ char *encoded = NULL;
5787 ssize_t len;
5788 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5789 if (len)
5790 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5791 }
5792
5793 if (c->root_hash_sig_path)
5794 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5795
5796 if (c->root_verity)
5797 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5798
5799 STRV_FOREACH(e, c->environment)
5800 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5801
5802 STRV_FOREACH(e, c->environment_files)
5803 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5804
5805 STRV_FOREACH(e, c->pass_environment)
5806 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5807
5808 STRV_FOREACH(e, c->unset_environment)
5809 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5810
5811 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5812
5813 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5814 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5815
5816 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5817 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5818
5819 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5820 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5821 }
5822 }
5823
5824 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
5825
5826 if (c->nice_set)
5827 fprintf(f, "%sNice: %i\n", prefix, c->nice);
5828
5829 if (c->oom_score_adjust_set)
5830 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
5831
5832 if (c->coredump_filter_set)
5833 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
5834
5835 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5836 if (c->rlimit[i]) {
5837 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5838 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5839 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5840 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5841 }
5842
5843 if (c->ioprio_set) {
5844 _cleanup_free_ char *class_str = NULL;
5845
5846 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
5847 if (r >= 0)
5848 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5849
5850 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
5851 }
5852
5853 if (c->cpu_sched_set) {
5854 _cleanup_free_ char *policy_str = NULL;
5855
5856 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5857 if (r >= 0)
5858 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5859
5860 fprintf(f,
5861 "%sCPUSchedulingPriority: %i\n"
5862 "%sCPUSchedulingResetOnFork: %s\n",
5863 prefix, c->cpu_sched_priority,
5864 prefix, yes_no(c->cpu_sched_reset_on_fork));
5865 }
5866
5867 if (c->cpu_set.set) {
5868 _cleanup_free_ char *affinity = NULL;
5869
5870 affinity = cpu_set_to_range_string(&c->cpu_set);
5871 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5872 }
5873
5874 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5875 _cleanup_free_ char *nodes = NULL;
5876
5877 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5878 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5879 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5880 }
5881
5882 if (c->timer_slack_nsec != NSEC_INFINITY)
5883 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5884
5885 fprintf(f,
5886 "%sStandardInput: %s\n"
5887 "%sStandardOutput: %s\n"
5888 "%sStandardError: %s\n",
5889 prefix, exec_input_to_string(c->std_input),
5890 prefix, exec_output_to_string(c->std_output),
5891 prefix, exec_output_to_string(c->std_error));
5892
5893 if (c->std_input == EXEC_INPUT_NAMED_FD)
5894 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5895 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5896 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5897 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5898 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5899
5900 if (c->std_input == EXEC_INPUT_FILE)
5901 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5902 if (c->std_output == EXEC_OUTPUT_FILE)
5903 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5904 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5905 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5906 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5907 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5908 if (c->std_error == EXEC_OUTPUT_FILE)
5909 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5910 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5911 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5912 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5913 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5914
5915 if (c->tty_path)
5916 fprintf(f,
5917 "%sTTYPath: %s\n"
5918 "%sTTYReset: %s\n"
5919 "%sTTYVHangup: %s\n"
5920 "%sTTYVTDisallocate: %s\n"
5921 "%sTTYRows: %u\n"
5922 "%sTTYColumns: %u\n",
5923 prefix, c->tty_path,
5924 prefix, yes_no(c->tty_reset),
5925 prefix, yes_no(c->tty_vhangup),
5926 prefix, yes_no(c->tty_vt_disallocate),
5927 prefix, c->tty_rows,
5928 prefix, c->tty_cols);
5929
5930 if (IN_SET(c->std_output,
5931 EXEC_OUTPUT_KMSG,
5932 EXEC_OUTPUT_JOURNAL,
5933 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5934 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5935 IN_SET(c->std_error,
5936 EXEC_OUTPUT_KMSG,
5937 EXEC_OUTPUT_JOURNAL,
5938 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5939 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5940
5941 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5942
5943 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5944 if (r >= 0)
5945 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5946
5947 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5948 if (r >= 0)
5949 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5950 }
5951
5952 if (c->log_level_max >= 0) {
5953 _cleanup_free_ char *t = NULL;
5954
5955 (void) log_level_to_string_alloc(c->log_level_max, &t);
5956
5957 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5958 }
5959
5960 if (c->log_ratelimit_interval_usec > 0)
5961 fprintf(f,
5962 "%sLogRateLimitIntervalSec: %s\n",
5963 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
5964
5965 if (c->log_ratelimit_burst > 0)
5966 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
5967
5968 if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
5969 fprintf(f, "%sLogFilterPatterns:", prefix);
5970
5971 char *pattern;
5972 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
5973 fprintf(f, " %s", pattern);
5974 SET_FOREACH(pattern, c->log_filter_denied_patterns)
5975 fprintf(f, " ~%s", pattern);
5976 fputc('\n', f);
5977 }
5978
5979 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5980 fprintf(f, "%sLogExtraFields: ", prefix);
5981 fwrite(c->log_extra_fields[j].iov_base,
5982 1, c->log_extra_fields[j].iov_len,
5983 f);
5984 fputc('\n', f);
5985 }
5986
5987 if (c->log_namespace)
5988 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5989
5990 if (c->secure_bits) {
5991 _cleanup_free_ char *str = NULL;
5992
5993 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5994 if (r >= 0)
5995 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5996 }
5997
5998 if (c->capability_bounding_set != CAP_MASK_UNSET) {
5999 _cleanup_free_ char *str = NULL;
6000
6001 r = capability_set_to_string(c->capability_bounding_set, &str);
6002 if (r >= 0)
6003 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6004 }
6005
6006 if (c->capability_ambient_set != 0) {
6007 _cleanup_free_ char *str = NULL;
6008
6009 r = capability_set_to_string(c->capability_ambient_set, &str);
6010 if (r >= 0)
6011 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6012 }
6013
6014 if (c->user)
6015 fprintf(f, "%sUser: %s\n", prefix, c->user);
6016 if (c->group)
6017 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6018
6019 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6020
6021 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6022
6023 if (c->pam_name)
6024 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6025
6026 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6027 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6028 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6029 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6030 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6031 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6032
6033 for (size_t i = 0; i < c->n_bind_mounts; i++)
6034 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6035 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6036 c->bind_mounts[i].ignore_enoent ? "-": "",
6037 c->bind_mounts[i].source,
6038 c->bind_mounts[i].destination,
6039 c->bind_mounts[i].recursive ? "rbind" : "norbind");
6040
6041 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6042 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6043
6044 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6045 t->path,
6046 isempty(t->options) ? "" : ":",
6047 strempty(t->options));
6048 }
6049
6050 if (c->utmp_id)
6051 fprintf(f,
6052 "%sUtmpIdentifier: %s\n",
6053 prefix, c->utmp_id);
6054
6055 if (c->selinux_context)
6056 fprintf(f,
6057 "%sSELinuxContext: %s%s\n",
6058 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6059
6060 if (c->apparmor_profile)
6061 fprintf(f,
6062 "%sAppArmorProfile: %s%s\n",
6063 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6064
6065 if (c->smack_process_label)
6066 fprintf(f,
6067 "%sSmackProcessLabel: %s%s\n",
6068 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6069
6070 if (c->personality != PERSONALITY_INVALID)
6071 fprintf(f,
6072 "%sPersonality: %s\n",
6073 prefix, strna(personality_to_string(c->personality)));
6074
6075 fprintf(f,
6076 "%sLockPersonality: %s\n",
6077 prefix, yes_no(c->lock_personality));
6078
6079 if (c->syscall_filter) {
6080 fprintf(f,
6081 "%sSystemCallFilter: ",
6082 prefix);
6083
6084 if (!c->syscall_allow_list)
6085 fputc('~', f);
6086
6087 #if HAVE_SECCOMP
6088 void *id, *val;
6089 bool first = true;
6090 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6091 _cleanup_free_ char *name = NULL;
6092 const char *errno_name = NULL;
6093 int num = PTR_TO_INT(val);
6094
6095 if (first)
6096 first = false;
6097 else
6098 fputc(' ', f);
6099
6100 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6101 fputs(strna(name), f);
6102
6103 if (num >= 0) {
6104 errno_name = seccomp_errno_or_action_to_string(num);
6105 if (errno_name)
6106 fprintf(f, ":%s", errno_name);
6107 else
6108 fprintf(f, ":%d", num);
6109 }
6110 }
6111 #endif
6112
6113 fputc('\n', f);
6114 }
6115
6116 if (c->syscall_archs) {
6117 fprintf(f,
6118 "%sSystemCallArchitectures:",
6119 prefix);
6120
6121 #if HAVE_SECCOMP
6122 void *id;
6123 SET_FOREACH(id, c->syscall_archs)
6124 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6125 #endif
6126 fputc('\n', f);
6127 }
6128
6129 if (exec_context_restrict_namespaces_set(c)) {
6130 _cleanup_free_ char *s = NULL;
6131
6132 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6133 if (r >= 0)
6134 fprintf(f, "%sRestrictNamespaces: %s\n",
6135 prefix, strna(s));
6136 }
6137
6138 #if HAVE_LIBBPF
6139 if (exec_context_restrict_filesystems_set(c)) {
6140 char *fs;
6141 SET_FOREACH(fs, c->restrict_filesystems)
6142 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6143 }
6144 #endif
6145
6146 if (c->network_namespace_path)
6147 fprintf(f,
6148 "%sNetworkNamespacePath: %s\n",
6149 prefix, c->network_namespace_path);
6150
6151 if (c->syscall_errno > 0) {
6152 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6153
6154 #if HAVE_SECCOMP
6155 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6156 if (errno_name)
6157 fputs(errno_name, f);
6158 else
6159 fprintf(f, "%d", c->syscall_errno);
6160 #endif
6161 fputc('\n', f);
6162 }
6163
6164 for (size_t i = 0; i < c->n_mount_images; i++) {
6165 fprintf(f, "%sMountImages: %s%s:%s", prefix,
6166 c->mount_images[i].ignore_enoent ? "-": "",
6167 c->mount_images[i].source,
6168 c->mount_images[i].destination);
6169 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
6170 fprintf(f, ":%s:%s",
6171 partition_designator_to_string(o->partition_designator),
6172 strempty(o->options));
6173 fprintf(f, "\n");
6174 }
6175
6176 for (size_t i = 0; i < c->n_extension_images; i++) {
6177 fprintf(f, "%sExtensionImages: %s%s", prefix,
6178 c->extension_images[i].ignore_enoent ? "-": "",
6179 c->extension_images[i].source);
6180 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6181 fprintf(f, ":%s:%s",
6182 partition_designator_to_string(o->partition_designator),
6183 strempty(o->options));
6184 fprintf(f, "\n");
6185 }
6186
6187 strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
6188 }
6189
6190 bool exec_context_maintains_privileges(const ExecContext *c) {
6191 assert(c);
6192
6193 /* Returns true if the process forked off would run under
6194 * an unchanged UID or as root. */
6195
6196 if (!c->user)
6197 return true;
6198
6199 if (streq(c->user, "root") || streq(c->user, "0"))
6200 return true;
6201
6202 return false;
6203 }
6204
6205 int exec_context_get_effective_ioprio(const ExecContext *c) {
6206 int p;
6207
6208 assert(c);
6209
6210 if (c->ioprio_set)
6211 return c->ioprio;
6212
6213 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6214 if (p < 0)
6215 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
6216
6217 return ioprio_normalize(p);
6218 }
6219
6220 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6221 assert(c);
6222
6223 /* Explicit setting wins */
6224 if (c->mount_apivfs_set)
6225 return c->mount_apivfs;
6226
6227 /* Default to "yes" if root directory or image are specified */
6228 if (exec_context_with_rootfs(c))
6229 return true;
6230
6231 return false;
6232 }
6233
6234 void exec_context_free_log_extra_fields(ExecContext *c) {
6235 assert(c);
6236
6237 for (size_t l = 0; l < c->n_log_extra_fields; l++)
6238 free(c->log_extra_fields[l].iov_base);
6239 c->log_extra_fields = mfree(c->log_extra_fields);
6240 c->n_log_extra_fields = 0;
6241 }
6242
6243 void exec_context_revert_tty(ExecContext *c) {
6244 _cleanup_close_ int fd = -EBADF;
6245 const char *path;
6246 struct stat st;
6247 int r;
6248
6249 assert(c);
6250
6251 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6252 exec_context_tty_reset(c, NULL);
6253
6254 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6255 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6256 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6257 if (!exec_context_may_touch_tty(c))
6258 return;
6259
6260 path = exec_context_tty_path(c);
6261 if (!path)
6262 return;
6263
6264 fd = open(path, O_PATH|O_CLOEXEC);
6265 if (fd < 0)
6266 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6267 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6268 path);
6269
6270 if (fstat(fd, &st) < 0)
6271 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6272
6273 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6274 * if things are a character device, since a proper check either means we'd have to open the TTY and
6275 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6276 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6277 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6278 if (!S_ISCHR(st.st_mode))
6279 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6280
6281 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6282 if (r < 0)
6283 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6284 }
6285
6286 int exec_context_get_clean_directories(
6287 ExecContext *c,
6288 char **prefix,
6289 ExecCleanMask mask,
6290 char ***ret) {
6291
6292 _cleanup_strv_free_ char **l = NULL;
6293 int r;
6294
6295 assert(c);
6296 assert(prefix);
6297 assert(ret);
6298
6299 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6300 if (!FLAGS_SET(mask, 1U << t))
6301 continue;
6302
6303 if (!prefix[t])
6304 continue;
6305
6306 for (size_t i = 0; i < c->directories[t].n_items; i++) {
6307 char *j;
6308
6309 j = path_join(prefix[t], c->directories[t].items[i].path);
6310 if (!j)
6311 return -ENOMEM;
6312
6313 r = strv_consume(&l, j);
6314 if (r < 0)
6315 return r;
6316
6317 /* Also remove private directories unconditionally. */
6318 if (t != EXEC_DIRECTORY_CONFIGURATION) {
6319 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6320 if (!j)
6321 return -ENOMEM;
6322
6323 r = strv_consume(&l, j);
6324 if (r < 0)
6325 return r;
6326 }
6327
6328 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6329 j = path_join(prefix[t], *symlink);
6330 if (!j)
6331 return -ENOMEM;
6332
6333 r = strv_consume(&l, j);
6334 if (r < 0)
6335 return r;
6336 }
6337 }
6338 }
6339
6340 *ret = TAKE_PTR(l);
6341 return 0;
6342 }
6343
6344 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6345 ExecCleanMask mask = 0;
6346
6347 assert(c);
6348 assert(ret);
6349
6350 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6351 if (c->directories[t].n_items > 0)
6352 mask |= 1U << t;
6353
6354 *ret = mask;
6355 return 0;
6356 }
6357
6358 void exec_status_start(ExecStatus *s, pid_t pid) {
6359 assert(s);
6360
6361 *s = (ExecStatus) {
6362 .pid = pid,
6363 };
6364
6365 dual_timestamp_get(&s->start_timestamp);
6366 }
6367
6368 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
6369 assert(s);
6370
6371 if (s->pid != pid)
6372 *s = (ExecStatus) {
6373 .pid = pid,
6374 };
6375
6376 dual_timestamp_get(&s->exit_timestamp);
6377
6378 s->code = code;
6379 s->status = status;
6380
6381 if (context && context->utmp_id)
6382 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
6383 }
6384
6385 void exec_status_reset(ExecStatus *s) {
6386 assert(s);
6387
6388 *s = (ExecStatus) {};
6389 }
6390
6391 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
6392 assert(s);
6393 assert(f);
6394
6395 if (s->pid <= 0)
6396 return;
6397
6398 prefix = strempty(prefix);
6399
6400 fprintf(f,
6401 "%sPID: "PID_FMT"\n",
6402 prefix, s->pid);
6403
6404 if (dual_timestamp_is_set(&s->start_timestamp))
6405 fprintf(f,
6406 "%sStart Timestamp: %s\n",
6407 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
6408
6409 if (dual_timestamp_is_set(&s->exit_timestamp))
6410 fprintf(f,
6411 "%sExit Timestamp: %s\n"
6412 "%sExit Code: %s\n"
6413 "%sExit Status: %i\n",
6414 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
6415 prefix, sigchld_code_to_string(s->code),
6416 prefix, s->status);
6417 }
6418
6419 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
6420 _cleanup_free_ char *cmd = NULL;
6421 const char *prefix2;
6422
6423 assert(c);
6424 assert(f);
6425
6426 prefix = strempty(prefix);
6427 prefix2 = strjoina(prefix, "\t");
6428
6429 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
6430
6431 fprintf(f,
6432 "%sCommand Line: %s\n",
6433 prefix, strnull(cmd));
6434
6435 exec_status_dump(&c->exec_status, f, prefix2);
6436 }
6437
6438 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6439 assert(f);
6440
6441 prefix = strempty(prefix);
6442
6443 LIST_FOREACH(command, i, c)
6444 exec_command_dump(i, f, prefix);
6445 }
6446
6447 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6448 ExecCommand *end;
6449
6450 assert(l);
6451 assert(e);
6452
6453 if (*l) {
6454 /* It's kind of important, that we keep the order here */
6455 end = LIST_FIND_TAIL(command, *l);
6456 LIST_INSERT_AFTER(command, *l, end, e);
6457 } else
6458 *l = e;
6459 }
6460
6461 int exec_command_set(ExecCommand *c, const char *path, ...) {
6462 va_list ap;
6463 char **l, *p;
6464
6465 assert(c);
6466 assert(path);
6467
6468 va_start(ap, path);
6469 l = strv_new_ap(path, ap);
6470 va_end(ap);
6471
6472 if (!l)
6473 return -ENOMEM;
6474
6475 p = strdup(path);
6476 if (!p) {
6477 strv_free(l);
6478 return -ENOMEM;
6479 }
6480
6481 free_and_replace(c->path, p);
6482
6483 return strv_free_and_replace(c->argv, l);
6484 }
6485
6486 int exec_command_append(ExecCommand *c, const char *path, ...) {
6487 _cleanup_strv_free_ char **l = NULL;
6488 va_list ap;
6489 int r;
6490
6491 assert(c);
6492 assert(path);
6493
6494 va_start(ap, path);
6495 l = strv_new_ap(path, ap);
6496 va_end(ap);
6497
6498 if (!l)
6499 return -ENOMEM;
6500
6501 r = strv_extend_strv(&c->argv, l, false);
6502 if (r < 0)
6503 return r;
6504
6505 return 0;
6506 }
6507
6508 static char *destroy_tree(char *path) {
6509 if (!path)
6510 return NULL;
6511
6512 if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
6513 log_debug("Spawning process to nuke '%s'", path);
6514
6515 (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
6516 }
6517
6518 return mfree(path);
6519 }
6520
6521 static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
6522 if (!rt)
6523 return NULL;
6524
6525 if (rt->manager)
6526 (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
6527
6528 rt->id = mfree(rt->id);
6529 rt->tmp_dir = mfree(rt->tmp_dir);
6530 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6531 safe_close_pair(rt->netns_storage_socket);
6532 safe_close_pair(rt->ipcns_storage_socket);
6533 return mfree(rt);
6534 }
6535
/* Macro-generated helpers built on exec_shared_runtime_free(). Presumably these expand to
 * exec_shared_runtime_unref() and exec_shared_runtime_freep(), both of which are used elsewhere in
 * this file — see the macro definitions to confirm the exact semantics. */
DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
6538
6539 ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
6540 if (!rt)
6541 return NULL;
6542
6543 assert(rt->n_ref > 0);
6544 rt->n_ref--;
6545
6546 if (rt->n_ref > 0)
6547 return NULL;
6548
6549 rt->tmp_dir = destroy_tree(rt->tmp_dir);
6550 rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
6551
6552 return exec_shared_runtime_free(rt);
6553 }
6554
6555 static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
6556 _cleanup_free_ char *id_copy = NULL;
6557 ExecSharedRuntime *n;
6558
6559 assert(ret);
6560
6561 id_copy = strdup(id);
6562 if (!id_copy)
6563 return -ENOMEM;
6564
6565 n = new(ExecSharedRuntime, 1);
6566 if (!n)
6567 return -ENOMEM;
6568
6569 *n = (ExecSharedRuntime) {
6570 .id = TAKE_PTR(id_copy),
6571 .netns_storage_socket = PIPE_EBADF,
6572 .ipcns_storage_socket = PIPE_EBADF,
6573 };
6574
6575 *ret = n;
6576 return 0;
6577 }
6578
6579 static int exec_shared_runtime_add(
6580 Manager *m,
6581 const char *id,
6582 char **tmp_dir,
6583 char **var_tmp_dir,
6584 int netns_storage_socket[2],
6585 int ipcns_storage_socket[2],
6586 ExecSharedRuntime **ret) {
6587
6588 _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
6589 int r;
6590
6591 assert(m);
6592 assert(id);
6593
6594 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6595
6596 r = exec_shared_runtime_allocate(&rt, id);
6597 if (r < 0)
6598 return r;
6599
6600 r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
6601 if (r < 0)
6602 return r;
6603
6604 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6605 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6606 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6607
6608 if (netns_storage_socket) {
6609 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6610 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6611 }
6612
6613 if (ipcns_storage_socket) {
6614 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6615 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6616 }
6617
6618 rt->manager = m;
6619
6620 if (ret)
6621 *ret = rt;
6622 /* do not remove created ExecSharedRuntime object when the operation succeeds. */
6623 TAKE_PTR(rt);
6624 return 0;
6625 }
6626
6627 static int exec_shared_runtime_make(
6628 Manager *m,
6629 const ExecContext *c,
6630 const char *id,
6631 ExecSharedRuntime **ret) {
6632
6633 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6634 _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
6635 int r;
6636
6637 assert(m);
6638 assert(c);
6639 assert(id);
6640
6641 /* It is not necessary to create ExecSharedRuntime object. */
6642 if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
6643 *ret = NULL;
6644 return 0;
6645 }
6646
6647 if (c->private_tmp &&
6648 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6649 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6650 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6651 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6652 if (r < 0)
6653 return r;
6654 }
6655
6656 if (exec_needs_network_namespace(c)) {
6657 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6658 return -errno;
6659 }
6660
6661 if (exec_needs_ipc_namespace(c)) {
6662 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6663 return -errno;
6664 }
6665
6666 r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
6667 if (r < 0)
6668 return r;
6669
6670 return 1;
6671 }
6672
6673 int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
6674 ExecSharedRuntime *rt;
6675 int r;
6676
6677 assert(m);
6678 assert(id);
6679 assert(ret);
6680
6681 rt = hashmap_get(m->exec_shared_runtime_by_id, id);
6682 if (rt)
6683 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
6684 goto ref;
6685
6686 if (!create) {
6687 *ret = NULL;
6688 return 0;
6689 }
6690
6691 /* If not found, then create a new object. */
6692 r = exec_shared_runtime_make(m, c, id, &rt);
6693 if (r < 0)
6694 return r;
6695 if (r == 0) {
6696 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
6697 *ret = NULL;
6698 return 0;
6699 }
6700
6701 ref:
6702 /* increment reference counter. */
6703 rt->n_ref++;
6704 *ret = rt;
6705 return 1;
6706 }
6707
6708 int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6709 ExecSharedRuntime *rt;
6710
6711 assert(m);
6712 assert(f);
6713 assert(fds);
6714
6715 HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
6716 fprintf(f, "exec-runtime=%s", rt->id);
6717
6718 if (rt->tmp_dir)
6719 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6720
6721 if (rt->var_tmp_dir)
6722 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6723
6724 if (rt->netns_storage_socket[0] >= 0) {
6725 int copy;
6726
6727 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6728 if (copy < 0)
6729 return copy;
6730
6731 fprintf(f, " netns-socket-0=%i", copy);
6732 }
6733
6734 if (rt->netns_storage_socket[1] >= 0) {
6735 int copy;
6736
6737 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6738 if (copy < 0)
6739 return copy;
6740
6741 fprintf(f, " netns-socket-1=%i", copy);
6742 }
6743
6744 if (rt->ipcns_storage_socket[0] >= 0) {
6745 int copy;
6746
6747 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6748 if (copy < 0)
6749 return copy;
6750
6751 fprintf(f, " ipcns-socket-0=%i", copy);
6752 }
6753
6754 if (rt->ipcns_storage_socket[1] >= 0) {
6755 int copy;
6756
6757 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6758 if (copy < 0)
6759 return copy;
6760
6761 fprintf(f, " ipcns-socket-1=%i", copy);
6762 }
6763
6764 fputc('\n', f);
6765 }
6766
6767 return 0;
6768 }
6769
6770 int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6771 _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
6772 ExecSharedRuntime *rt;
6773 int r;
6774
6775 /* This is for the migration from old (v237 or earlier) deserialization text.
6776 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6777 * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
6778 * so or not from the serialized text, then we always creates a new object owned by this. */
6779
6780 assert(u);
6781 assert(key);
6782 assert(value);
6783
6784 /* Manager manages ExecSharedRuntime objects by the unit id.
6785 * So, we omit the serialized text when the unit does not have id (yet?)... */
6786 if (isempty(u->id)) {
6787 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6788 return 0;
6789 }
6790
6791 if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
6792 return log_oom();
6793
6794 rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
6795 if (!rt) {
6796 if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
6797 return log_oom();
6798
6799 rt = rt_create;
6800 }
6801
6802 if (streq(key, "tmp-dir")) {
6803 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6804 return -ENOMEM;
6805
6806 } else if (streq(key, "var-tmp-dir")) {
6807 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6808 return -ENOMEM;
6809
6810 } else if (streq(key, "netns-socket-0")) {
6811 int fd;
6812
6813 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
6814 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6815 return 0;
6816 }
6817
6818 safe_close(rt->netns_storage_socket[0]);
6819 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6820
6821 } else if (streq(key, "netns-socket-1")) {
6822 int fd;
6823
6824 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
6825 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6826 return 0;
6827 }
6828
6829 safe_close(rt->netns_storage_socket[1]);
6830 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
6831
6832 } else
6833 return 0;
6834
6835 /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
6836 if (rt_create) {
6837 r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
6838 if (r < 0) {
6839 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
6840 return 0;
6841 }
6842
6843 rt_create->manager = u->manager;
6844
6845 /* Avoid cleanup */
6846 TAKE_PTR(rt_create);
6847 }
6848
6849 return 1;
6850 }
6851
6852 int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6853 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6854 char *id = NULL;
6855 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
6856 const char *p, *v = ASSERT_PTR(value);
6857 size_t n;
6858
6859 assert(m);
6860 assert(fds);
6861
6862 n = strcspn(v, " ");
6863 id = strndupa_safe(v, n);
6864 if (v[n] != ' ')
6865 goto finalize;
6866 p = v + n + 1;
6867
6868 v = startswith(p, "tmp-dir=");
6869 if (v) {
6870 n = strcspn(v, " ");
6871 tmp_dir = strndup(v, n);
6872 if (!tmp_dir)
6873 return log_oom();
6874 if (v[n] != ' ')
6875 goto finalize;
6876 p = v + n + 1;
6877 }
6878
6879 v = startswith(p, "var-tmp-dir=");
6880 if (v) {
6881 n = strcspn(v, " ");
6882 var_tmp_dir = strndup(v, n);
6883 if (!var_tmp_dir)
6884 return log_oom();
6885 if (v[n] != ' ')
6886 goto finalize;
6887 p = v + n + 1;
6888 }
6889
6890 v = startswith(p, "netns-socket-0=");
6891 if (v) {
6892 char *buf;
6893
6894 n = strcspn(v, " ");
6895 buf = strndupa_safe(v, n);
6896
6897 netns_fdpair[0] = parse_fd(buf);
6898 if (netns_fdpair[0] < 0)
6899 return log_debug_errno(netns_fdpair[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
6900 if (!fdset_contains(fds, netns_fdpair[0]))
6901 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6902 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6903 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
6904 if (v[n] != ' ')
6905 goto finalize;
6906 p = v + n + 1;
6907 }
6908
6909 v = startswith(p, "netns-socket-1=");
6910 if (v) {
6911 char *buf;
6912
6913 n = strcspn(v, " ");
6914 buf = strndupa_safe(v, n);
6915
6916 netns_fdpair[1] = parse_fd(buf);
6917 if (netns_fdpair[1] < 0)
6918 return log_debug_errno(netns_fdpair[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
6919 if (!fdset_contains(fds, netns_fdpair[1]))
6920 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6921 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6922 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6923 if (v[n] != ' ')
6924 goto finalize;
6925 p = v + n + 1;
6926 }
6927
6928 v = startswith(p, "ipcns-socket-0=");
6929 if (v) {
6930 char *buf;
6931
6932 n = strcspn(v, " ");
6933 buf = strndupa_safe(v, n);
6934
6935 ipcns_fdpair[0] = parse_fd(buf);
6936 if (ipcns_fdpair[0] < 0)
6937 return log_debug_errno(ipcns_fdpair[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6938 if (!fdset_contains(fds, ipcns_fdpair[0]))
6939 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6940 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6941 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6942 if (v[n] != ' ')
6943 goto finalize;
6944 p = v + n + 1;
6945 }
6946
6947 v = startswith(p, "ipcns-socket-1=");
6948 if (v) {
6949 char *buf;
6950
6951 n = strcspn(v, " ");
6952 buf = strndupa_safe(v, n);
6953
6954 ipcns_fdpair[1] = parse_fd(buf);
6955 if (ipcns_fdpair[1] < 0)
6956 return log_debug_errno(ipcns_fdpair[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6957 if (!fdset_contains(fds, ipcns_fdpair[1]))
6958 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6959 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6960 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
6961 }
6962
6963 finalize:
6964 r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
6965 if (r < 0)
6966 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6967 return 0;
6968 }
6969
6970 void exec_shared_runtime_vacuum(Manager *m) {
6971 ExecSharedRuntime *rt;
6972
6973 assert(m);
6974
6975 /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
6976
6977 HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
6978 if (rt->n_ref > 0)
6979 continue;
6980
6981 (void) exec_shared_runtime_free(rt);
6982 }
6983 }
6984
6985 int exec_runtime_make(
6986 const Unit *unit,
6987 const ExecContext *context,
6988 ExecSharedRuntime *shared,
6989 DynamicCreds *creds,
6990 ExecRuntime **ret) {
6991 _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
6992 _cleanup_free_ char *ephemeral = NULL;
6993 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6994 int r;
6995
6996 assert(unit);
6997 assert(context);
6998 assert(ret);
6999
7000 if (!shared && !creds && !exec_needs_ephemeral(context)) {
7001 *ret = NULL;
7002 return 0;
7003 }
7004
7005 if (exec_needs_ephemeral(context)) {
7006 r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
7007 if (r < 0)
7008 return r;
7009
7010 r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
7011 if (r < 0)
7012 return r;
7013
7014 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
7015 return -errno;
7016 }
7017
7018 rt = new(ExecRuntime, 1);
7019 if (!rt)
7020 return -ENOMEM;
7021
7022 *rt = (ExecRuntime) {
7023 .shared = shared,
7024 .dynamic_creds = creds,
7025 .ephemeral_copy = TAKE_PTR(ephemeral),
7026 .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
7027 .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
7028 };
7029
7030 *ret = TAKE_PTR(rt);
7031 return 1;
7032 }
7033
7034 ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
7035 if (!rt)
7036 return NULL;
7037
7038 exec_shared_runtime_unref(rt->shared);
7039 dynamic_creds_unref(rt->dynamic_creds);
7040
7041 rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
7042
7043 safe_close_pair(rt->ephemeral_storage_socket);
7044 return mfree(rt);
7045 }
7046
7047 ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
7048 if (!rt)
7049 return NULL;
7050
7051 rt->shared = exec_shared_runtime_destroy(rt->shared);
7052 rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
7053 return exec_runtime_free(rt);
7054 }
7055
7056 void exec_params_clear(ExecParameters *p) {
7057 if (!p)
7058 return;
7059
7060 p->environment = strv_free(p->environment);
7061 p->fd_names = strv_free(p->fd_names);
7062 p->fds = mfree(p->fds);
7063 p->exec_fd = safe_close(p->exec_fd);
7064 }
7065
7066 void exec_directory_done(ExecDirectory *d) {
7067 if (!d)
7068 return;
7069
7070 for (size_t i = 0; i < d->n_items; i++) {
7071 free(d->items[i].path);
7072 strv_free(d->items[i].symlinks);
7073 }
7074
7075 d->items = mfree(d->items);
7076 d->n_items = 0;
7077 d->mode = 0755;
7078 }
7079
7080 static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7081 assert(d);
7082 assert(path);
7083
7084 for (size_t i = 0; i < d->n_items; i++)
7085 if (path_equal(d->items[i].path, path))
7086 return &d->items[i];
7087
7088 return NULL;
7089 }
7090
7091 int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
7092 _cleanup_strv_free_ char **s = NULL;
7093 _cleanup_free_ char *p = NULL;
7094 ExecDirectoryItem *existing;
7095 int r;
7096
7097 assert(d);
7098 assert(path);
7099
7100 existing = exec_directory_find(d, path);
7101 if (existing) {
7102 r = strv_extend(&existing->symlinks, symlink);
7103 if (r < 0)
7104 return r;
7105
7106 return 0; /* existing item is updated */
7107 }
7108
7109 p = strdup(path);
7110 if (!p)
7111 return -ENOMEM;
7112
7113 if (symlink) {
7114 s = strv_new(symlink);
7115 if (!s)
7116 return -ENOMEM;
7117 }
7118
7119 if (!GREEDY_REALLOC(d->items, d->n_items + 1))
7120 return -ENOMEM;
7121
7122 d->items[d->n_items++] = (ExecDirectoryItem) {
7123 .path = TAKE_PTR(p),
7124 .symlinks = TAKE_PTR(s),
7125 };
7126
7127 return 1; /* new item is added */
7128 }
7129
7130 static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7131 assert(a);
7132 assert(b);
7133
7134 return path_compare(a->path, b->path);
7135 }
7136
7137 void exec_directory_sort(ExecDirectory *d) {
7138 assert(d);
7139
7140 /* Sort the exec directories to make always parent directories processed at first in
7141 * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7142 * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7143 * list. See also comments in setup_exec_directory() and issue #24783. */
7144
7145 if (d->n_items <= 1)
7146 return;
7147
7148 typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
7149
7150 for (size_t i = 1; i < d->n_items; i++)
7151 for (size_t j = 0; j < i; j++)
7152 if (path_startswith(d->items[i].path, d->items[j].path)) {
7153 d->items[i].only_create = true;
7154 break;
7155 }
7156 }
7157
7158 ExecCleanMask exec_clean_mask_from_string(const char *s) {
7159 ExecDirectoryType t;
7160
7161 assert(s);
7162
7163 if (streq(s, "all"))
7164 return EXEC_CLEAN_ALL;
7165 if (streq(s, "fdstore"))
7166 return EXEC_CLEAN_FDSTORE;
7167
7168 t = exec_resource_type_from_string(s);
7169 if (t < 0)
7170 return (ExecCleanMask) t;
7171
7172 return 1U << t;
7173 }
7174
7175 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
7176 [EXEC_INPUT_NULL] = "null",
7177 [EXEC_INPUT_TTY] = "tty",
7178 [EXEC_INPUT_TTY_FORCE] = "tty-force",
7179 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
7180 [EXEC_INPUT_SOCKET] = "socket",
7181 [EXEC_INPUT_NAMED_FD] = "fd",
7182 [EXEC_INPUT_DATA] = "data",
7183 [EXEC_INPUT_FILE] = "file",
7184 };
7185
7186 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7187
7188 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
7189 [EXEC_OUTPUT_INHERIT] = "inherit",
7190 [EXEC_OUTPUT_NULL] = "null",
7191 [EXEC_OUTPUT_TTY] = "tty",
7192 [EXEC_OUTPUT_KMSG] = "kmsg",
7193 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
7194 [EXEC_OUTPUT_JOURNAL] = "journal",
7195 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
7196 [EXEC_OUTPUT_SOCKET] = "socket",
7197 [EXEC_OUTPUT_NAMED_FD] = "fd",
7198 [EXEC_OUTPUT_FILE] = "file",
7199 [EXEC_OUTPUT_FILE_APPEND] = "append",
7200 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
7201 };
7202
7203 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
7204
7205 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
7206 [EXEC_UTMP_INIT] = "init",
7207 [EXEC_UTMP_LOGIN] = "login",
7208 [EXEC_UTMP_USER] = "user",
7209 };
7210
7211 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
7212
7213 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
7214 [EXEC_PRESERVE_NO] = "no",
7215 [EXEC_PRESERVE_YES] = "yes",
7216 [EXEC_PRESERVE_RESTART] = "restart",
7217 };
7218
7219 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
7220
7221 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
7222 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7223 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
7224 [EXEC_DIRECTORY_STATE] = "StateDirectory",
7225 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
7226 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
7227 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
7228 };
7229
7230 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
7231
7232 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7233 static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7234 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
7235 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
7236 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
7237 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
7238 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
7239 };
7240
7241 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7242
7243 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7244 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7245 * directories, specifically .timer units with their timestamp touch file. */
7246 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7247 [EXEC_DIRECTORY_RUNTIME] = "runtime",
7248 [EXEC_DIRECTORY_STATE] = "state",
7249 [EXEC_DIRECTORY_CACHE] = "cache",
7250 [EXEC_DIRECTORY_LOGS] = "logs",
7251 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
7252 };
7253
7254 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7255
7256 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7257 * the service payload in. */
7258 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7259 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
7260 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
7261 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
7262 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
7263 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
7264 };
7265
7266 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7267
7268 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
7269 [EXEC_KEYRING_INHERIT] = "inherit",
7270 [EXEC_KEYRING_PRIVATE] = "private",
7271 [EXEC_KEYRING_SHARED] = "shared",
7272 };
7273
7274 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);