]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
core: add [State|Runtime|Cache|Logs]Directory symlink as second parameter
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <poll.h>
6 #include <sys/eventfd.h>
7 #include <sys/ioctl.h>
8 #include <sys/mman.h>
9 #include <sys/mount.h>
10 #include <sys/personality.h>
11 #include <sys/prctl.h>
12 #include <sys/shm.h>
13 #include <sys/types.h>
14 #include <sys/un.h>
15 #include <unistd.h>
16 #include <utmpx.h>
17
18 #if HAVE_PAM
19 #include <security/pam_appl.h>
20 #endif
21
22 #if HAVE_SELINUX
23 #include <selinux/selinux.h>
24 #endif
25
26 #if HAVE_SECCOMP
27 #include <seccomp.h>
28 #endif
29
30 #if HAVE_APPARMOR
31 #include <sys/apparmor.h>
32 #endif
33
34 #include "sd-messages.h"
35
36 #include "acl-util.h"
37 #include "af-list.h"
38 #include "alloc-util.h"
39 #if HAVE_APPARMOR
40 #include "apparmor-util.h"
41 #endif
42 #include "async.h"
43 #include "barrier.h"
44 #include "bpf-lsm.h"
45 #include "cap-list.h"
46 #include "capability-util.h"
47 #include "cgroup-setup.h"
48 #include "chase-symlinks.h"
49 #include "chown-recursive.h"
50 #include "cpu-set-util.h"
51 #include "creds-util.h"
52 #include "data-fd-util.h"
53 #include "def.h"
54 #include "env-file.h"
55 #include "env-util.h"
56 #include "errno-list.h"
57 #include "escape.h"
58 #include "execute.h"
59 #include "exit-status.h"
60 #include "fd-util.h"
61 #include "fileio.h"
62 #include "format-util.h"
63 #include "glob-util.h"
64 #include "hexdecoct.h"
65 #include "io-util.h"
66 #include "label.h"
67 #include "log.h"
68 #include "macro.h"
69 #include "manager.h"
70 #include "manager-dump.h"
71 #include "memory-util.h"
72 #include "missing_fs.h"
73 #include "missing_ioprio.h"
74 #include "mkdir.h"
75 #include "mount-util.h"
76 #include "mountpoint-util.h"
77 #include "namespace.h"
78 #include "parse-util.h"
79 #include "path-util.h"
80 #include "process-util.h"
81 #include "random-util.h"
82 #include "rlimit-util.h"
83 #include "rm-rf.h"
84 #if HAVE_SECCOMP
85 #include "seccomp-util.h"
86 #endif
87 #include "securebits-util.h"
88 #include "selinux-util.h"
89 #include "signal-util.h"
90 #include "smack-util.h"
91 #include "socket-util.h"
92 #include "special.h"
93 #include "stat-util.h"
94 #include "string-table.h"
95 #include "string-util.h"
96 #include "strv.h"
97 #include "syslog-util.h"
98 #include "terminal-util.h"
99 #include "tmpfile-util.h"
100 #include "umask-util.h"
101 #include "unit-serialize.h"
102 #include "user-util.h"
103 #include "utmp-wtmp.h"
104
/* Two idle timeouts, expressed as multiples of USEC_PER_SEC (5 and 1 respectively). */
#define IDLE_TIMEOUT_USEC  (5 * USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1 * USEC_PER_SEC)

/* Send buffer size (8 MiB) requested for the log stream socket, see connect_logger_as(). */
#define SNDBUF_SIZE (8 * 1024 * 1024)
109
/* Rearranges the passed file descriptors so that fds[i] ends up being fd number i+3, i.e. the fds occupy
 * the consecutive range right after stdin/stdout/stderr. The array is modified in place.
 *
 * Returns 0 on success (also if there is nothing to do), or a negative errno-style error if duplicating
 * one of the fds fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds <= 0)
                return 0;

        assert(fds);

        do {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int want = idx + 3, got;

                        /* Nothing to do if this one already sits in its slot */
                        if (fds[idx] == want)
                                continue;

                        got = fcntl(fds[idx], F_DUPFD, want);
                        if (got < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = got;

                        /* F_DUPFD gave us a higher fd than requested, hence the wanted slot is still
                         * occupied. Remember the first index where that happened, and do another pass
                         * starting from there. */
                        if (got != want && retry_at < 0)
                                retry_at = idx;
                }

                first = retry_at;
        } while (first >= 0);

        return 0;
}
149
/* Adjusts the file descriptor flags of the first n_socket_fds + n_storage_fds entries of fds[]:
 * O_NONBLOCK is set or cleared according to 'nonblock' on the socket fds only (which come first in the
 * array), and FD_CLOEXEC is cleared on all of them.
 *
 * Returns 0 on success, or the negative error returned by fd_nonblock()/fd_cloexec(). */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;

        if (total == 0)
                return 0;

        assert(fds);

        for (size_t k = 0; k < total; k++) {
                int r;

                /* The blocking mode is only relevant for the socket activation fds */
                if (k < n_socket_fds) {
                        r = fd_nonblock(fds[k], nonblock);
                        if (r < 0)
                                return r;
                }

                /* These fds are supposed to be passed on to our children, hence always turn off
                 * close-on-exec */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
182
183 static const char *exec_context_tty_path(const ExecContext *context) {
184 assert(context);
185
186 if (context->stdio_as_fds)
187 return NULL;
188
189 if (context->tty_path)
190 return context->tty_path;
191
192 return "/dev/console";
193 }
194
195 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
196 const char *path;
197
198 assert(context);
199
200 path = exec_context_tty_path(context);
201
202 if (context->tty_vhangup) {
203 if (p && p->stdin_fd >= 0)
204 (void) terminal_vhangup_fd(p->stdin_fd);
205 else if (path)
206 (void) terminal_vhangup(path);
207 }
208
209 if (context->tty_reset) {
210 if (p && p->stdin_fd >= 0)
211 (void) reset_terminal_fd(p->stdin_fd, true);
212 else if (path)
213 (void) reset_terminal(path);
214 }
215
216 if (context->tty_vt_disallocate && path)
217 (void) vt_disallocate(path);
218 }
219
220 static bool is_terminal_input(ExecInput i) {
221 return IN_SET(i,
222 EXEC_INPUT_TTY,
223 EXEC_INPUT_TTY_FORCE,
224 EXEC_INPUT_TTY_FAIL);
225 }
226
227 static bool is_terminal_output(ExecOutput o) {
228 return IN_SET(o,
229 EXEC_OUTPUT_TTY,
230 EXEC_OUTPUT_KMSG_AND_CONSOLE,
231 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
232 }
233
234 static bool is_kmsg_output(ExecOutput o) {
235 return IN_SET(o,
236 EXEC_OUTPUT_KMSG,
237 EXEC_OUTPUT_KMSG_AND_CONSOLE);
238 }
239
240 static bool exec_context_needs_term(const ExecContext *c) {
241 assert(c);
242
243 /* Return true if the execution context suggests we should set $TERM to something useful. */
244
245 if (is_terminal_input(c->std_input))
246 return true;
247
248 if (is_terminal_output(c->std_output))
249 return true;
250
251 if (is_terminal_output(c->std_error))
252 return true;
253
254 return !!c->tty_path;
255 }
256
/* Opens /dev/null with the specified flags (plus O_NOCTTY) and hands the resulting fd to move_fd() to
 * place it at fd number 'nfd'. Returns the result of move_fd(), or a negative errno-style error if
 * /dev/null cannot be opened. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        return move_fd(null_fd, nfd, false);
}
268
269 static int connect_journal_socket(
270 int fd,
271 const char *log_namespace,
272 uid_t uid,
273 gid_t gid) {
274
275 union sockaddr_union sa;
276 socklen_t sa_len;
277 uid_t olduid = UID_INVALID;
278 gid_t oldgid = GID_INVALID;
279 const char *j;
280 int r;
281
282 j = log_namespace ?
283 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
284 "/run/systemd/journal/stdout";
285 r = sockaddr_un_set_path(&sa.un, j);
286 if (r < 0)
287 return r;
288 sa_len = r;
289
290 if (gid_is_valid(gid)) {
291 oldgid = getgid();
292
293 if (setegid(gid) < 0)
294 return -errno;
295 }
296
297 if (uid_is_valid(uid)) {
298 olduid = getuid();
299
300 if (seteuid(uid) < 0) {
301 r = -errno;
302 goto restore_gid;
303 }
304 }
305
306 r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
307
308 /* If we fail to restore the uid or gid, things will likely
309 fail later on. This should only happen if an LSM interferes. */
310
311 if (uid_is_valid(uid))
312 (void) seteuid(olduid);
313
314 restore_gid:
315 if (gid_is_valid(gid))
316 (void) setegid(oldgid);
317
318 return r;
319 }
320
321 static int connect_logger_as(
322 const Unit *unit,
323 const ExecContext *context,
324 const ExecParameters *params,
325 ExecOutput output,
326 const char *ident,
327 int nfd,
328 uid_t uid,
329 gid_t gid) {
330
331 _cleanup_close_ int fd = -1;
332 int r;
333
334 assert(context);
335 assert(params);
336 assert(output < _EXEC_OUTPUT_MAX);
337 assert(ident);
338 assert(nfd >= 0);
339
340 fd = socket(AF_UNIX, SOCK_STREAM, 0);
341 if (fd < 0)
342 return -errno;
343
344 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
345 if (r < 0)
346 return r;
347
348 if (shutdown(fd, SHUT_RD) < 0)
349 return -errno;
350
351 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
352
353 if (dprintf(fd,
354 "%s\n"
355 "%s\n"
356 "%i\n"
357 "%i\n"
358 "%i\n"
359 "%i\n"
360 "%i\n",
361 context->syslog_identifier ?: ident,
362 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
363 context->syslog_priority,
364 !!context->syslog_level_prefix,
365 false,
366 is_kmsg_output(output),
367 is_terminal_output(output)) < 0)
368 return -errno;
369
370 return move_fd(TAKE_FD(fd), nfd, false);
371 }
372
/* Opens the terminal at 'path' with the specified flags (plus O_NOCTTY) and hands the resulting fd to
 * move_fd() to place it at fd number 'nfd'. Returns the result of move_fd(), or the negative error
 * returned by open_terminal(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);
        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
385
386 static int acquire_path(const char *path, int flags, mode_t mode) {
387 union sockaddr_union sa;
388 socklen_t sa_len;
389 _cleanup_close_ int fd = -1;
390 int r;
391
392 assert(path);
393
394 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
395 flags |= O_CREAT;
396
397 fd = open(path, flags|O_NOCTTY, mode);
398 if (fd >= 0)
399 return TAKE_FD(fd);
400
401 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
402 return -errno;
403
404 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
405
406 r = sockaddr_un_set_path(&sa.un, path);
407 if (r < 0)
408 return r == -EINVAL ? -ENXIO : r;
409 sa_len = r;
410
411 fd = socket(AF_UNIX, SOCK_STREAM, 0);
412 if (fd < 0)
413 return -errno;
414
415 if (connect(fd, &sa.sa, sa_len) < 0)
416 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
417 * indication that this wasn't an AF_UNIX socket after all */
418
419 if ((flags & O_ACCMODE) == O_RDONLY)
420 r = shutdown(fd, SHUT_WR);
421 else if ((flags & O_ACCMODE) == O_WRONLY)
422 r = shutdown(fd, SHUT_RD);
423 else
424 r = 0;
425 if (r < 0)
426 return -errno;
427
428 return TAKE_FD(fd);
429 }
430
431 static int fixup_input(
432 const ExecContext *context,
433 int socket_fd,
434 bool apply_tty_stdin) {
435
436 ExecInput std_input;
437
438 assert(context);
439
440 std_input = context->std_input;
441
442 if (is_terminal_input(std_input) && !apply_tty_stdin)
443 return EXEC_INPUT_NULL;
444
445 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
446 return EXEC_INPUT_NULL;
447
448 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
449 return EXEC_INPUT_NULL;
450
451 return std_input;
452 }
453
454 static int fixup_output(ExecOutput output, int socket_fd) {
455
456 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
457 return EXEC_OUTPUT_INHERIT;
458
459 return output;
460 }
461
462 static int setup_input(
463 const ExecContext *context,
464 const ExecParameters *params,
465 int socket_fd,
466 const int named_iofds[static 3]) {
467
468 ExecInput i;
469
470 assert(context);
471 assert(params);
472 assert(named_iofds);
473
474 if (params->stdin_fd >= 0) {
475 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
476 return -errno;
477
478 /* Try to make this the controlling tty, if it is a tty, and reset it */
479 if (isatty(STDIN_FILENO)) {
480 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
481 (void) reset_terminal_fd(STDIN_FILENO, true);
482 }
483
484 return STDIN_FILENO;
485 }
486
487 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
488
489 switch (i) {
490
491 case EXEC_INPUT_NULL:
492 return open_null_as(O_RDONLY, STDIN_FILENO);
493
494 case EXEC_INPUT_TTY:
495 case EXEC_INPUT_TTY_FORCE:
496 case EXEC_INPUT_TTY_FAIL: {
497 int fd;
498
499 fd = acquire_terminal(exec_context_tty_path(context),
500 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
501 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
502 ACQUIRE_TERMINAL_WAIT,
503 USEC_INFINITY);
504 if (fd < 0)
505 return fd;
506
507 return move_fd(fd, STDIN_FILENO, false);
508 }
509
510 case EXEC_INPUT_SOCKET:
511 assert(socket_fd >= 0);
512
513 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
514
515 case EXEC_INPUT_NAMED_FD:
516 assert(named_iofds[STDIN_FILENO] >= 0);
517
518 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
519 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
520
521 case EXEC_INPUT_DATA: {
522 int fd;
523
524 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
525 if (fd < 0)
526 return fd;
527
528 return move_fd(fd, STDIN_FILENO, false);
529 }
530
531 case EXEC_INPUT_FILE: {
532 bool rw;
533 int fd;
534
535 assert(context->stdio_file[STDIN_FILENO]);
536
537 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
538 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
539
540 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
541 if (fd < 0)
542 return fd;
543
544 return move_fd(fd, STDIN_FILENO, false);
545 }
546
547 default:
548 assert_not_reached();
549 }
550 }
551
552 static bool can_inherit_stderr_from_stdout(
553 const ExecContext *context,
554 ExecOutput o,
555 ExecOutput e) {
556
557 assert(context);
558
559 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
560 * stderr fd */
561
562 if (e == EXEC_OUTPUT_INHERIT)
563 return true;
564 if (e != o)
565 return false;
566
567 if (e == EXEC_OUTPUT_NAMED_FD)
568 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
569
570 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
571 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
572
573 return true;
574 }
575
576 static int setup_output(
577 const Unit *unit,
578 const ExecContext *context,
579 const ExecParameters *params,
580 int fileno,
581 int socket_fd,
582 const int named_iofds[static 3],
583 const char *ident,
584 uid_t uid,
585 gid_t gid,
586 dev_t *journal_stream_dev,
587 ino_t *journal_stream_ino) {
588
589 ExecOutput o;
590 ExecInput i;
591 int r;
592
593 assert(unit);
594 assert(context);
595 assert(params);
596 assert(ident);
597 assert(journal_stream_dev);
598 assert(journal_stream_ino);
599
600 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
601
602 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
603 return -errno;
604
605 return STDOUT_FILENO;
606 }
607
608 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
609 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
610 return -errno;
611
612 return STDERR_FILENO;
613 }
614
615 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
616 o = fixup_output(context->std_output, socket_fd);
617
618 if (fileno == STDERR_FILENO) {
619 ExecOutput e;
620 e = fixup_output(context->std_error, socket_fd);
621
622 /* This expects the input and output are already set up */
623
624 /* Don't change the stderr file descriptor if we inherit all
625 * the way and are not on a tty */
626 if (e == EXEC_OUTPUT_INHERIT &&
627 o == EXEC_OUTPUT_INHERIT &&
628 i == EXEC_INPUT_NULL &&
629 !is_terminal_input(context->std_input) &&
630 getppid() != 1)
631 return fileno;
632
633 /* Duplicate from stdout if possible */
634 if (can_inherit_stderr_from_stdout(context, o, e))
635 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
636
637 o = e;
638
639 } else if (o == EXEC_OUTPUT_INHERIT) {
640 /* If input got downgraded, inherit the original value */
641 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
642 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
643
644 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
645 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
646 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
647
648 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
649 if (getppid() != 1)
650 return fileno;
651
652 /* We need to open /dev/null here anew, to get the right access mode. */
653 return open_null_as(O_WRONLY, fileno);
654 }
655
656 switch (o) {
657
658 case EXEC_OUTPUT_NULL:
659 return open_null_as(O_WRONLY, fileno);
660
661 case EXEC_OUTPUT_TTY:
662 if (is_terminal_input(i))
663 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
664
665 /* We don't reset the terminal if this is just about output */
666 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
667
668 case EXEC_OUTPUT_KMSG:
669 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
670 case EXEC_OUTPUT_JOURNAL:
671 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
672 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
673 if (r < 0) {
674 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
675 fileno == STDOUT_FILENO ? "stdout" : "stderr");
676 r = open_null_as(O_WRONLY, fileno);
677 } else {
678 struct stat st;
679
680 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
681 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
682 * services to detect whether they are connected to the journal or not.
683 *
684 * If both stdout and stderr are connected to a stream then let's make sure to store the data
685 * about STDERR as that's usually the best way to do logging. */
686
687 if (fstat(fileno, &st) >= 0 &&
688 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
689 *journal_stream_dev = st.st_dev;
690 *journal_stream_ino = st.st_ino;
691 }
692 }
693 return r;
694
695 case EXEC_OUTPUT_SOCKET:
696 assert(socket_fd >= 0);
697
698 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
699
700 case EXEC_OUTPUT_NAMED_FD:
701 assert(named_iofds[fileno] >= 0);
702
703 (void) fd_nonblock(named_iofds[fileno], false);
704 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
705
706 case EXEC_OUTPUT_FILE:
707 case EXEC_OUTPUT_FILE_APPEND:
708 case EXEC_OUTPUT_FILE_TRUNCATE: {
709 bool rw;
710 int fd, flags;
711
712 assert(context->stdio_file[fileno]);
713
714 rw = context->std_input == EXEC_INPUT_FILE &&
715 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
716
717 if (rw)
718 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
719
720 flags = O_WRONLY;
721 if (o == EXEC_OUTPUT_FILE_APPEND)
722 flags |= O_APPEND;
723 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
724 flags |= O_TRUNC;
725
726 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
727 if (fd < 0)
728 return fd;
729
730 return move_fd(fd, fileno, 0);
731 }
732
733 default:
734 assert_not_reached();
735 }
736 }
737
738 static int chown_terminal(int fd, uid_t uid) {
739 int r;
740
741 assert(fd >= 0);
742
743 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
744 if (isatty(fd) < 1) {
745 if (IN_SET(errno, EINVAL, ENOTTY))
746 return 0; /* not a tty */
747
748 return -errno;
749 }
750
751 /* This might fail. What matters are the results. */
752 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
753 if (r < 0)
754 return r;
755
756 return 1;
757 }
758
759 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
760 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
761 int r;
762
763 assert(_saved_stdin);
764 assert(_saved_stdout);
765
766 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
767 if (saved_stdin < 0)
768 return -errno;
769
770 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
771 if (saved_stdout < 0)
772 return -errno;
773
774 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
775 if (fd < 0)
776 return fd;
777
778 r = chown_terminal(fd, getuid());
779 if (r < 0)
780 return r;
781
782 r = reset_terminal_fd(fd, true);
783 if (r < 0)
784 return r;
785
786 r = rearrange_stdio(fd, fd, STDERR_FILENO);
787 fd = -1;
788 if (r < 0)
789 return r;
790
791 *_saved_stdin = saved_stdin;
792 *_saved_stdout = saved_stdout;
793
794 saved_stdin = saved_stdout = -1;
795
796 return 0;
797 }
798
799 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
800 assert(err < 0);
801
802 if (err == -ETIMEDOUT)
803 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
804 else {
805 errno = -err;
806 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
807 }
808 }
809
810 static void write_confirm_error(int err, const char *vc, const Unit *u) {
811 _cleanup_close_ int fd = -1;
812
813 assert(vc);
814
815 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
816 if (fd < 0)
817 return;
818
819 write_confirm_error_fd(err, fd, u);
820 }
821
/* Counterpart of setup_confirm_stdio(): releases the terminal, reinstalls the saved stdin and stdout
 * fds (those that are valid) and closes the saved fds, resetting the caller's variables.
 *
 * Returns 0 on success, or a negative errno-style error if reinstalling an fd failed (if both fail,
 * the error for stdout is the one returned). The saved fds are closed in any case. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
843
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1,   /* don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0,   /* don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         =  1,   /* execute the command */
};
849
850 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
851 int saved_stdout = -1, saved_stdin = -1, r;
852 _cleanup_free_ char *e = NULL;
853 char c;
854
855 /* For any internal errors, assume a positive response. */
856 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
857 if (r < 0) {
858 write_confirm_error(r, vc, u);
859 return CONFIRM_EXECUTE;
860 }
861
862 /* confirm_spawn might have been disabled while we were sleeping. */
863 if (manager_is_confirm_spawn_disabled(u->manager)) {
864 r = 1;
865 goto restore_stdio;
866 }
867
868 e = ellipsize(cmdline, 60, 100);
869 if (!e) {
870 log_oom();
871 r = CONFIRM_EXECUTE;
872 goto restore_stdio;
873 }
874
875 for (;;) {
876 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
877 if (r < 0) {
878 write_confirm_error_fd(r, STDOUT_FILENO, u);
879 r = CONFIRM_EXECUTE;
880 goto restore_stdio;
881 }
882
883 switch (c) {
884 case 'c':
885 printf("Resuming normal execution.\n");
886 manager_disable_confirm_spawn();
887 r = 1;
888 break;
889 case 'D':
890 unit_dump(u, stdout, " ");
891 continue; /* ask again */
892 case 'f':
893 printf("Failing execution.\n");
894 r = CONFIRM_PRETEND_FAILURE;
895 break;
896 case 'h':
897 printf(" c - continue, proceed without asking anymore\n"
898 " D - dump, show the state of the unit\n"
899 " f - fail, don't execute the command and pretend it failed\n"
900 " h - help\n"
901 " i - info, show a short summary of the unit\n"
902 " j - jobs, show jobs that are in progress\n"
903 " s - skip, don't execute the command and pretend it succeeded\n"
904 " y - yes, execute the command\n");
905 continue; /* ask again */
906 case 'i':
907 printf(" Description: %s\n"
908 " Unit: %s\n"
909 " Command: %s\n",
910 u->id, u->description, cmdline);
911 continue; /* ask again */
912 case 'j':
913 manager_dump_jobs(u->manager, stdout, " ");
914 continue; /* ask again */
915 case 'n':
916 /* 'n' was removed in favor of 'f'. */
917 printf("Didn't understand 'n', did you mean 'f'?\n");
918 continue; /* ask again */
919 case 's':
920 printf("Skipping execution.\n");
921 r = CONFIRM_PRETEND_SUCCESS;
922 break;
923 case 'y':
924 r = CONFIRM_EXECUTE;
925 break;
926 default:
927 assert_not_reached();
928 }
929 break;
930 }
931
932 restore_stdio:
933 restore_confirm_stdio(&saved_stdin, &saved_stdout);
934 return r;
935 }
936
937 static int get_fixed_user(const ExecContext *c, const char **user,
938 uid_t *uid, gid_t *gid,
939 const char **home, const char **shell) {
940 int r;
941 const char *name;
942
943 assert(c);
944
945 if (!c->user)
946 return 0;
947
948 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
949 * (i.e. are "/" or "/bin/nologin"). */
950
951 name = c->user;
952 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
953 if (r < 0)
954 return r;
955
956 *user = name;
957 return 0;
958 }
959
960 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
961 int r;
962 const char *name;
963
964 assert(c);
965
966 if (!c->group)
967 return 0;
968
969 name = c->group;
970 r = get_group_creds(&name, gid, 0);
971 if (r < 0)
972 return r;
973
974 *group = name;
975 return 0;
976 }
977
978 static int get_supplementary_groups(const ExecContext *c, const char *user,
979 const char *group, gid_t gid,
980 gid_t **supplementary_gids, int *ngids) {
981 char **i;
982 int r, k = 0;
983 int ngroups_max;
984 bool keep_groups = false;
985 gid_t *groups = NULL;
986 _cleanup_free_ gid_t *l_gids = NULL;
987
988 assert(c);
989
990 /*
991 * If user is given, then lookup GID and supplementary groups list.
992 * We avoid NSS lookups for gid=0. Also we have to initialize groups
993 * here and as early as possible so we keep the list of supplementary
994 * groups of the caller.
995 */
996 if (user && gid_is_valid(gid) && gid != 0) {
997 /* First step, initialize groups from /etc/groups */
998 if (initgroups(user, gid) < 0)
999 return -errno;
1000
1001 keep_groups = true;
1002 }
1003
1004 if (strv_isempty(c->supplementary_groups))
1005 return 0;
1006
1007 /*
1008 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1009 * be positive, otherwise fail.
1010 */
1011 errno = 0;
1012 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1013 if (ngroups_max <= 0)
1014 return errno_or_else(EOPNOTSUPP);
1015
1016 l_gids = new(gid_t, ngroups_max);
1017 if (!l_gids)
1018 return -ENOMEM;
1019
1020 if (keep_groups) {
1021 /*
1022 * Lookup the list of groups that the user belongs to, we
1023 * avoid NSS lookups here too for gid=0.
1024 */
1025 k = ngroups_max;
1026 if (getgrouplist(user, gid, l_gids, &k) < 0)
1027 return -EINVAL;
1028 } else
1029 k = 0;
1030
1031 STRV_FOREACH(i, c->supplementary_groups) {
1032 const char *g;
1033
1034 if (k >= ngroups_max)
1035 return -E2BIG;
1036
1037 g = *i;
1038 r = get_group_creds(&g, l_gids+k, 0);
1039 if (r < 0)
1040 return r;
1041
1042 k++;
1043 }
1044
1045 /*
1046 * Sets ngids to zero to drop all supplementary groups, happens
1047 * when we are under root and SupplementaryGroups= is empty.
1048 */
1049 if (k == 0) {
1050 *ngids = 0;
1051 return 0;
1052 }
1053
1054 /* Otherwise get the final list of supplementary groups */
1055 groups = memdup(l_gids, sizeof(gid_t) * k);
1056 if (!groups)
1057 return -ENOMEM;
1058
1059 *supplementary_gids = groups;
1060 *ngids = k;
1061
1062 groups = NULL;
1063
1064 return 0;
1065 }
1066
/* Applies the supplementary groups (if there are any) and then, if 'gid' is valid, sets the real,
 * effective and saved GID to it. Returns 0 on success, a negative errno-style error otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        int r;

        /* Only touch the supplementary groups if there are any to set */
        if (ngids > 0) {
                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1085
/* Updates the securebits of the calling process: all bits in 'mask' are cleared, then all bits in
 * 'bits' are set.
 *
 * Returns 0 if the securebits already had the requested value, 1 if they were changed, and a negative
 * errno-style error on failure. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        after = (before & ~mask) | bits;
        if (after == before)
                return 0; /* nothing to change */

        if (prctl(PR_SET_SECUREBITS, after) < 0)
                return -errno;

        return 1;
}
1099
1100 static int enforce_user(const ExecContext *context, uid_t uid) {
1101 assert(context);
1102 int r;
1103
1104 if (!uid_is_valid(uid))
1105 return 0;
1106
1107 /* Sets (but doesn't look up) the uid and make sure we keep the
1108 * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1109 * required, so we also need keep-caps in this case.
1110 */
1111
1112 if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
1113
1114 /* First step: If we need to keep capabilities but
1115 * drop privileges we need to make sure we keep our
1116 * caps, while we drop privileges. */
1117 if (uid != 0) {
1118 /* Add KEEP_CAPS to the securebits */
1119 r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1120 if (r < 0)
1121 return r;
1122 }
1123 }
1124
1125 /* Second step: actually set the uids */
1126 if (setresuid(uid, uid, uid) < 0)
1127 return -errno;
1128
1129 /* At this point we should have all necessary capabilities but
1130 are otherwise a normal user. However, the caps might got
1131 corrupted due to the setresuid() so we need clean them up
1132 later. This is done outside of this call. */
1133
1134 return 0;
1135 }
1136
1137 #if HAVE_PAM
1138
1139 static int null_conv(
1140 int num_msg,
1141 const struct pam_message **msg,
1142 struct pam_response **resp,
1143 void *appdata_ptr) {
1144
1145 /* We don't support conversations */
1146
1147 return PAM_CONV_ERR;
1148 }
1149
1150 #endif
1151
1152 static int setup_pam(
1153 const char *name,
1154 const char *user,
1155 uid_t uid,
1156 gid_t gid,
1157 const char *tty,
1158 char ***env,
1159 const int fds[], size_t n_fds) {
1160
1161 #if HAVE_PAM
1162
1163 static const struct pam_conv conv = {
1164 .conv = null_conv,
1165 .appdata_ptr = NULL
1166 };
1167
1168 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1169 pam_handle_t *handle = NULL;
1170 sigset_t old_ss;
1171 int pam_code = PAM_SUCCESS, r;
1172 char **nv, **e = NULL;
1173 bool close_session = false;
1174 pid_t pam_pid = 0, parent_pid;
1175 int flags = 0;
1176
1177 assert(name);
1178 assert(user);
1179 assert(env);
1180
1181 /* We set up PAM in the parent process, then fork. The child
1182 * will then stay around until killed via PR_GET_PDEATHSIG or
1183 * systemd via the cgroup logic. It will then remove the PAM
1184 * session again. The parent process will exec() the actual
1185 * daemon. We do things this way to ensure that the main PID
1186 * of the daemon is the one we initially fork()ed. */
1187
1188 r = barrier_create(&barrier);
1189 if (r < 0)
1190 goto fail;
1191
1192 if (log_get_max_level() < LOG_DEBUG)
1193 flags |= PAM_SILENT;
1194
1195 pam_code = pam_start(name, user, &conv, &handle);
1196 if (pam_code != PAM_SUCCESS) {
1197 handle = NULL;
1198 goto fail;
1199 }
1200
1201 if (!tty) {
1202 _cleanup_free_ char *q = NULL;
1203
1204 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1205 * out if that's the case, and read the TTY off it. */
1206
1207 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1208 tty = strjoina("/dev/", q);
1209 }
1210
1211 if (tty) {
1212 pam_code = pam_set_item(handle, PAM_TTY, tty);
1213 if (pam_code != PAM_SUCCESS)
1214 goto fail;
1215 }
1216
1217 STRV_FOREACH(nv, *env) {
1218 pam_code = pam_putenv(handle, *nv);
1219 if (pam_code != PAM_SUCCESS)
1220 goto fail;
1221 }
1222
1223 pam_code = pam_acct_mgmt(handle, flags);
1224 if (pam_code != PAM_SUCCESS)
1225 goto fail;
1226
1227 pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
1228 if (pam_code != PAM_SUCCESS)
1229 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));
1230
1231 pam_code = pam_open_session(handle, flags);
1232 if (pam_code != PAM_SUCCESS)
1233 goto fail;
1234
1235 close_session = true;
1236
1237 e = pam_getenvlist(handle);
1238 if (!e) {
1239 pam_code = PAM_BUF_ERR;
1240 goto fail;
1241 }
1242
1243 /* Block SIGTERM, so that we know that it won't get lost in
1244 * the child */
1245
1246 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1247
1248 parent_pid = getpid_cached();
1249
1250 r = safe_fork("(sd-pam)", 0, &pam_pid);
1251 if (r < 0)
1252 goto fail;
1253 if (r == 0) {
1254 int sig, ret = EXIT_PAM;
1255
1256 /* The child's job is to reset the PAM session on
1257 * termination */
1258 barrier_set_role(&barrier, BARRIER_CHILD);
1259
1260 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1261 * those fds are open here that have been opened by PAM. */
1262 (void) close_many(fds, n_fds);
1263
1264 /* Drop privileges - we don't need any to pam_close_session
1265 * and this will make PR_SET_PDEATHSIG work in most cases.
1266 * If this fails, ignore the error - but expect sd-pam threads
1267 * to fail to exit normally */
1268
1269 r = maybe_setgroups(0, NULL);
1270 if (r < 0)
1271 log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
1272 if (setresgid(gid, gid, gid) < 0)
1273 log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
1274 if (setresuid(uid, uid, uid) < 0)
1275 log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
1276
1277 (void) ignore_signals(SIGPIPE);
1278
1279 /* Wait until our parent died. This will only work if
1280 * the above setresuid() succeeds, otherwise the kernel
1281 * will not allow unprivileged parents kill their privileged
1282 * children this way. We rely on the control groups kill logic
1283 * to do the rest for us. */
1284 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1285 goto child_finish;
1286
1287 /* Tell the parent that our setup is done. This is especially
1288 * important regarding dropping privileges. Otherwise, unit
1289 * setup might race against our setresuid(2) call.
1290 *
1291 * If the parent aborted, we'll detect this below, hence ignore
1292 * return failure here. */
1293 (void) barrier_place(&barrier);
1294
1295 /* Check if our parent process might already have died? */
1296 if (getppid() == parent_pid) {
1297 sigset_t ss;
1298
1299 assert_se(sigemptyset(&ss) >= 0);
1300 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1301
1302 for (;;) {
1303 if (sigwait(&ss, &sig) < 0) {
1304 if (errno == EINTR)
1305 continue;
1306
1307 goto child_finish;
1308 }
1309
1310 assert(sig == SIGTERM);
1311 break;
1312 }
1313 }
1314
1315 pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
1316 if (pam_code != PAM_SUCCESS)
1317 goto child_finish;
1318
1319 /* If our parent died we'll end the session */
1320 if (getppid() != parent_pid) {
1321 pam_code = pam_close_session(handle, flags);
1322 if (pam_code != PAM_SUCCESS)
1323 goto child_finish;
1324 }
1325
1326 ret = 0;
1327
1328 child_finish:
1329 pam_end(handle, pam_code | flags);
1330 _exit(ret);
1331 }
1332
1333 barrier_set_role(&barrier, BARRIER_PARENT);
1334
1335 /* If the child was forked off successfully it will do all the
1336 * cleanups, so forget about the handle here. */
1337 handle = NULL;
1338
1339 /* Unblock SIGTERM again in the parent */
1340 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1341
1342 /* We close the log explicitly here, since the PAM modules
1343 * might have opened it, but we don't want this fd around. */
1344 closelog();
1345
1346 /* Synchronously wait for the child to initialize. We don't care for
1347 * errors as we cannot recover. However, warn loudly if it happens. */
1348 if (!barrier_place_and_sync(&barrier))
1349 log_error("PAM initialization failed");
1350
1351 return strv_free_and_replace(*env, e);
1352
1353 fail:
1354 if (pam_code != PAM_SUCCESS) {
1355 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1356 r = -EPERM; /* PAM errors do not map to errno */
1357 } else
1358 log_error_errno(r, "PAM failed: %m");
1359
1360 if (handle) {
1361 if (close_session)
1362 pam_code = pam_close_session(handle, flags);
1363
1364 pam_end(handle, pam_code | flags);
1365 }
1366
1367 strv_free(e);
1368 closelog();
1369
1370 return r;
1371 #else
1372 return 0;
1373 #endif
1374 }
1375
/* Renames the calling process after the last component of 'path', wrapped in parentheses. Only the
 * final 8 characters of the component are kept; an empty component yields "(...)". */
static void rename_process_from_path(const char *path) {
        char buf[11];
        const char *leaf;
        size_t len;

        /* The result has to fit into 10 characters (i.e. the length of "/sbin/init") so that it
         * looks pretty in /bin/ps: up to 8 characters of the name plus the two parentheses. */

        leaf = basename(path);
        if (isempty(leaf)) {
                rename_process("(...)");
                return;
        }

        len = strlen(leaf);
        if (len > 8) {
                /* Keep the tail: it is usually the more interesting part, as the beginning might
                 * just be "systemd-". */
                leaf += len - 8;
                len = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, leaf, len);
        buf[len + 1] = ')';
        buf[len + 2] = 0;

        rename_process(buf);
}
1406
1407 static bool context_has_address_families(const ExecContext *c) {
1408 assert(c);
1409
1410 return c->address_families_allow_list ||
1411 !set_isempty(c->address_families);
1412 }
1413
1414 static bool context_has_syscall_filters(const ExecContext *c) {
1415 assert(c);
1416
1417 return c->syscall_allow_list ||
1418 !hashmap_isempty(c->syscall_filter);
1419 }
1420
1421 static bool context_has_syscall_logs(const ExecContext *c) {
1422 assert(c);
1423
1424 return c->syscall_log_allow_list ||
1425 !hashmap_isempty(c->syscall_log);
1426 }
1427
1428 static bool context_has_no_new_privileges(const ExecContext *c) {
1429 assert(c);
1430
1431 if (c->no_new_privileges)
1432 return true;
1433
1434 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1435 return false;
1436
1437 /* We need NNP if we have any form of seccomp and are unprivileged */
1438 return c->lock_personality ||
1439 c->memory_deny_write_execute ||
1440 c->private_devices ||
1441 c->protect_clock ||
1442 c->protect_hostname ||
1443 c->protect_kernel_tunables ||
1444 c->protect_kernel_modules ||
1445 c->protect_kernel_logs ||
1446 context_has_address_families(c) ||
1447 exec_context_restrict_namespaces_set(c) ||
1448 c->restrict_realtime ||
1449 c->restrict_suid_sgid ||
1450 !set_isempty(c->syscall_archs) ||
1451 context_has_syscall_filters(c) ||
1452 context_has_syscall_logs(c);
1453 }
1454
1455 static bool exec_context_has_credentials(const ExecContext *context) {
1456
1457 assert(context);
1458
1459 return !hashmap_isempty(context->set_credentials) ||
1460 !hashmap_isempty(context->load_credentials);
1461 }
1462
1463 #if HAVE_SECCOMP
1464
1465 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1466
1467 if (is_seccomp_available())
1468 return false;
1469
1470 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1471 return true;
1472 }
1473
1474 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1475 uint32_t negative_action, default_action, action;
1476 int r;
1477
1478 assert(u);
1479 assert(c);
1480
1481 if (!context_has_syscall_filters(c))
1482 return 0;
1483
1484 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1485 return 0;
1486
1487 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1488
1489 if (c->syscall_allow_list) {
1490 default_action = negative_action;
1491 action = SCMP_ACT_ALLOW;
1492 } else {
1493 default_action = SCMP_ACT_ALLOW;
1494 action = negative_action;
1495 }
1496
1497 if (needs_ambient_hack) {
1498 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1499 if (r < 0)
1500 return r;
1501 }
1502
1503 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1504 }
1505
1506 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1507 #ifdef SCMP_ACT_LOG
1508 uint32_t default_action, action;
1509 #endif
1510
1511 assert(u);
1512 assert(c);
1513
1514 if (!context_has_syscall_logs(c))
1515 return 0;
1516
1517 #ifdef SCMP_ACT_LOG
1518 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1519 return 0;
1520
1521 if (c->syscall_log_allow_list) {
1522 /* Log nothing but the ones listed */
1523 default_action = SCMP_ACT_ALLOW;
1524 action = SCMP_ACT_LOG;
1525 } else {
1526 /* Log everything but the ones listed */
1527 default_action = SCMP_ACT_LOG;
1528 action = SCMP_ACT_ALLOW;
1529 }
1530
1531 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1532 #else
1533 /* old libseccomp */
1534 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1535 return 0;
1536 #endif
1537 }
1538
1539 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1540 assert(u);
1541 assert(c);
1542
1543 if (set_isempty(c->syscall_archs))
1544 return 0;
1545
1546 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1547 return 0;
1548
1549 return seccomp_restrict_archs(c->syscall_archs);
1550 }
1551
1552 static int apply_address_families(const Unit* u, const ExecContext *c) {
1553 assert(u);
1554 assert(c);
1555
1556 if (!context_has_address_families(c))
1557 return 0;
1558
1559 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1560 return 0;
1561
1562 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1563 }
1564
1565 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1566 assert(u);
1567 assert(c);
1568
1569 if (!c->memory_deny_write_execute)
1570 return 0;
1571
1572 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1573 return 0;
1574
1575 return seccomp_memory_deny_write_execute();
1576 }
1577
1578 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1579 assert(u);
1580 assert(c);
1581
1582 if (!c->restrict_realtime)
1583 return 0;
1584
1585 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1586 return 0;
1587
1588 return seccomp_restrict_realtime();
1589 }
1590
1591 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1592 assert(u);
1593 assert(c);
1594
1595 if (!c->restrict_suid_sgid)
1596 return 0;
1597
1598 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1599 return 0;
1600
1601 return seccomp_restrict_suid_sgid();
1602 }
1603
1604 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1605 assert(u);
1606 assert(c);
1607
1608 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1609 * let's protect even those systems where this is left on in the kernel. */
1610
1611 if (!c->protect_kernel_tunables)
1612 return 0;
1613
1614 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1615 return 0;
1616
1617 return seccomp_protect_sysctl();
1618 }
1619
1620 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1621 assert(u);
1622 assert(c);
1623
1624 /* Turn off module syscalls on ProtectKernelModules=yes */
1625
1626 if (!c->protect_kernel_modules)
1627 return 0;
1628
1629 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1630 return 0;
1631
1632 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1633 }
1634
1635 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1636 assert(u);
1637 assert(c);
1638
1639 if (!c->protect_kernel_logs)
1640 return 0;
1641
1642 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1643 return 0;
1644
1645 return seccomp_protect_syslog();
1646 }
1647
1648 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1649 assert(u);
1650 assert(c);
1651
1652 if (!c->protect_clock)
1653 return 0;
1654
1655 if (skip_seccomp_unavailable(u, "ProtectClock="))
1656 return 0;
1657
1658 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1659 }
1660
1661 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1662 assert(u);
1663 assert(c);
1664
1665 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1666
1667 if (!c->private_devices)
1668 return 0;
1669
1670 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1671 return 0;
1672
1673 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1674 }
1675
1676 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1677 assert(u);
1678 assert(c);
1679
1680 if (!exec_context_restrict_namespaces_set(c))
1681 return 0;
1682
1683 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1684 return 0;
1685
1686 return seccomp_restrict_namespaces(c->restrict_namespaces);
1687 }
1688
1689 #if HAVE_LIBBPF
1690 static bool skip_lsm_bpf_unsupported(const Unit* u, const char* msg) {
1691 if (lsm_bpf_supported())
1692 return false;
1693
1694 log_unit_debug(u, "LSM BPF not supported, skipping %s", msg);
1695 return true;
1696 }
1697
1698 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1699 assert(u);
1700 assert(c);
1701
1702 if (!exec_context_restrict_filesystems_set(c))
1703 return 0;
1704
1705 if (skip_lsm_bpf_unsupported(u, "RestrictFileSystems="))
1706 return 0;
1707
1708 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1709 }
1710 #endif
1711
1712 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1713 unsigned long personality;
1714 int r;
1715
1716 assert(u);
1717 assert(c);
1718
1719 if (!c->lock_personality)
1720 return 0;
1721
1722 if (skip_seccomp_unavailable(u, "LockPersonality="))
1723 return 0;
1724
1725 personality = c->personality;
1726
1727 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1728 if (personality == PERSONALITY_INVALID) {
1729
1730 r = opinionated_personality(&personality);
1731 if (r < 0)
1732 return r;
1733 }
1734
1735 return seccomp_lock_personality(personality);
1736 }
1737
1738 #endif
1739
1740 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1741 assert(u);
1742 assert(c);
1743
1744 if (!c->protect_hostname)
1745 return 0;
1746
1747 if (ns_type_supported(NAMESPACE_UTS)) {
1748 if (unshare(CLONE_NEWUTS) < 0) {
1749 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1750 *ret_exit_status = EXIT_NAMESPACE;
1751 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1752 }
1753
1754 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1755 }
1756 } else
1757 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1758
1759 #if HAVE_SECCOMP
1760 int r;
1761
1762 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1763 return 0;
1764
1765 r = seccomp_protect_hostname();
1766 if (r < 0) {
1767 *ret_exit_status = EXIT_SECCOMP;
1768 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1769 }
1770 #endif
1771
1772 return 0;
1773 }
1774
1775 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1776 assert(idle_pipe);
1777
1778 idle_pipe[1] = safe_close(idle_pipe[1]);
1779 idle_pipe[2] = safe_close(idle_pipe[2]);
1780
1781 if (idle_pipe[0] >= 0) {
1782 int r;
1783
1784 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1785
1786 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1787 ssize_t n;
1788
1789 /* Signal systemd that we are bored and want to continue. */
1790 n = write(idle_pipe[3], "x", 1);
1791 if (n > 0)
1792 /* Wait for systemd to react to the signal above. */
1793 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1794 }
1795
1796 idle_pipe[0] = safe_close(idle_pipe[0]);
1797
1798 }
1799
1800 idle_pipe[3] = safe_close(idle_pipe[3]);
1801 }
1802
1803 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1804
1805 static int build_environment(
1806 const Unit *u,
1807 const ExecContext *c,
1808 const ExecParameters *p,
1809 size_t n_fds,
1810 const char *home,
1811 const char *username,
1812 const char *shell,
1813 dev_t journal_stream_dev,
1814 ino_t journal_stream_ino,
1815 char ***ret) {
1816
1817 _cleanup_strv_free_ char **our_env = NULL;
1818 size_t n_env = 0;
1819 char *x;
1820
1821 assert(u);
1822 assert(c);
1823 assert(p);
1824 assert(ret);
1825
1826 #define N_ENV_VARS 17
1827 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1828 if (!our_env)
1829 return -ENOMEM;
1830
1831 if (n_fds > 0) {
1832 _cleanup_free_ char *joined = NULL;
1833
1834 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1835 return -ENOMEM;
1836 our_env[n_env++] = x;
1837
1838 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1839 return -ENOMEM;
1840 our_env[n_env++] = x;
1841
1842 joined = strv_join(p->fd_names, ":");
1843 if (!joined)
1844 return -ENOMEM;
1845
1846 x = strjoin("LISTEN_FDNAMES=", joined);
1847 if (!x)
1848 return -ENOMEM;
1849 our_env[n_env++] = x;
1850 }
1851
1852 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1853 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1854 return -ENOMEM;
1855 our_env[n_env++] = x;
1856
1857 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1858 return -ENOMEM;
1859 our_env[n_env++] = x;
1860 }
1861
1862 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1863 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1864 * check the database directly. */
1865 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1866 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1867 if (!x)
1868 return -ENOMEM;
1869 our_env[n_env++] = x;
1870 }
1871
1872 if (home) {
1873 x = strjoin("HOME=", home);
1874 if (!x)
1875 return -ENOMEM;
1876
1877 path_simplify(x + 5);
1878 our_env[n_env++] = x;
1879 }
1880
1881 if (username) {
1882 x = strjoin("LOGNAME=", username);
1883 if (!x)
1884 return -ENOMEM;
1885 our_env[n_env++] = x;
1886
1887 x = strjoin("USER=", username);
1888 if (!x)
1889 return -ENOMEM;
1890 our_env[n_env++] = x;
1891 }
1892
1893 if (shell) {
1894 x = strjoin("SHELL=", shell);
1895 if (!x)
1896 return -ENOMEM;
1897
1898 path_simplify(x + 6);
1899 our_env[n_env++] = x;
1900 }
1901
1902 if (!sd_id128_is_null(u->invocation_id)) {
1903 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1904 return -ENOMEM;
1905
1906 our_env[n_env++] = x;
1907 }
1908
1909 if (exec_context_needs_term(c)) {
1910 const char *tty_path, *term = NULL;
1911
1912 tty_path = exec_context_tty_path(c);
1913
1914 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1915 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1916 * container manager passes to PID 1 ends up all the way in the console login shown. */
1917
1918 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1919 term = getenv("TERM");
1920
1921 if (!term)
1922 term = default_term_for_tty(tty_path);
1923
1924 x = strjoin("TERM=", term);
1925 if (!x)
1926 return -ENOMEM;
1927 our_env[n_env++] = x;
1928 }
1929
1930 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1931 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1932 return -ENOMEM;
1933
1934 our_env[n_env++] = x;
1935 }
1936
1937 if (c->log_namespace) {
1938 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1939 if (!x)
1940 return -ENOMEM;
1941
1942 our_env[n_env++] = x;
1943 }
1944
1945 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1946 _cleanup_free_ char *joined = NULL;
1947 const char *n;
1948
1949 if (!p->prefix[t])
1950 continue;
1951
1952 if (c->directories[t].n_items == 0)
1953 continue;
1954
1955 n = exec_directory_env_name_to_string(t);
1956 if (!n)
1957 continue;
1958
1959 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1960 _cleanup_free_ char *prefixed = NULL;
1961
1962 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1963 if (!prefixed)
1964 return -ENOMEM;
1965
1966 if (!strextend_with_separator(&joined, ":", prefixed))
1967 return -ENOMEM;
1968 }
1969
1970 x = strjoin(n, "=", joined);
1971 if (!x)
1972 return -ENOMEM;
1973
1974 our_env[n_env++] = x;
1975 }
1976
1977 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1978 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1979 if (!x)
1980 return -ENOMEM;
1981
1982 our_env[n_env++] = x;
1983 }
1984
1985 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1986 return -ENOMEM;
1987
1988 our_env[n_env++] = x;
1989
1990 our_env[n_env++] = NULL;
1991 assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1992 #undef N_ENV_VARS
1993
1994 *ret = TAKE_PTR(our_env);
1995
1996 return 0;
1997 }
1998
1999 static int build_pass_environment(const ExecContext *c, char ***ret) {
2000 _cleanup_strv_free_ char **pass_env = NULL;
2001 size_t n_env = 0;
2002 char **i;
2003
2004 STRV_FOREACH(i, c->pass_environment) {
2005 _cleanup_free_ char *x = NULL;
2006 char *v;
2007
2008 v = getenv(*i);
2009 if (!v)
2010 continue;
2011 x = strjoin(*i, "=", v);
2012 if (!x)
2013 return -ENOMEM;
2014
2015 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2016 return -ENOMEM;
2017
2018 pass_env[n_env++] = TAKE_PTR(x);
2019 pass_env[n_env] = NULL;
2020 }
2021
2022 *ret = TAKE_PTR(pass_env);
2023
2024 return 0;
2025 }
2026
2027 bool exec_needs_mount_namespace(
2028 const ExecContext *context,
2029 const ExecParameters *params,
2030 const ExecRuntime *runtime) {
2031
2032 assert(context);
2033
2034 if (context->root_image)
2035 return true;
2036
2037 if (!strv_isempty(context->read_write_paths) ||
2038 !strv_isempty(context->read_only_paths) ||
2039 !strv_isempty(context->inaccessible_paths) ||
2040 !strv_isempty(context->exec_paths) ||
2041 !strv_isempty(context->no_exec_paths))
2042 return true;
2043
2044 if (context->n_bind_mounts > 0)
2045 return true;
2046
2047 if (context->n_temporary_filesystems > 0)
2048 return true;
2049
2050 if (context->n_mount_images > 0)
2051 return true;
2052
2053 if (context->n_extension_images > 0)
2054 return true;
2055
2056 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
2057 return true;
2058
2059 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2060 return true;
2061
2062 if (context->private_devices ||
2063 context->private_mounts ||
2064 context->protect_system != PROTECT_SYSTEM_NO ||
2065 context->protect_home != PROTECT_HOME_NO ||
2066 context->protect_kernel_tunables ||
2067 context->protect_kernel_modules ||
2068 context->protect_kernel_logs ||
2069 context->protect_control_groups ||
2070 context->protect_proc != PROTECT_PROC_DEFAULT ||
2071 context->proc_subset != PROC_SUBSET_ALL ||
2072 context->private_ipc ||
2073 context->ipc_namespace_path)
2074 return true;
2075
2076 if (context->root_directory) {
2077 if (exec_context_get_effective_mount_apivfs(context))
2078 return true;
2079
2080 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2081 if (params && !params->prefix[t])
2082 continue;
2083
2084 if (context->directories[t].n_items > 0)
2085 return true;
2086 }
2087 }
2088
2089 if (context->dynamic_user &&
2090 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2091 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2092 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2093 return true;
2094
2095 if (context->log_namespace)
2096 return true;
2097
2098 return false;
2099 }
2100
2101 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2102 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2103 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2104 _cleanup_close_ int unshare_ready_fd = -1;
2105 _cleanup_(sigkill_waitp) pid_t pid = 0;
2106 uint64_t c = 1;
2107 ssize_t n;
2108 int r;
2109
2110 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2111 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2112 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2113 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2114 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2115 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2116 * continues execution normally.
2117 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2118 * does not need CAP_SETUID to write the single line mapping to itself. */
2119
2120 /* Can only set up multiple mappings with CAP_SETUID. */
2121 if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
2122 r = asprintf(&uid_map,
2123 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2124 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2125 ouid, ouid, uid, uid);
2126 else
2127 r = asprintf(&uid_map,
2128 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2129 ouid, ouid);
2130
2131 if (r < 0)
2132 return -ENOMEM;
2133
2134 /* Can only set up multiple mappings with CAP_SETGID. */
2135 if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
2136 r = asprintf(&gid_map,
2137 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2138 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2139 ogid, ogid, gid, gid);
2140 else
2141 r = asprintf(&gid_map,
2142 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2143 ogid, ogid);
2144
2145 if (r < 0)
2146 return -ENOMEM;
2147
2148 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2149 * namespace. */
2150 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2151 if (unshare_ready_fd < 0)
2152 return -errno;
2153
2154 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2155 * failed. */
2156 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2157 return -errno;
2158
2159 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2160 if (r < 0)
2161 return r;
2162 if (r == 0) {
2163 _cleanup_close_ int fd = -1;
2164 const char *a;
2165 pid_t ppid;
2166
2167 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2168 * here, after the parent opened its own user namespace. */
2169
2170 ppid = getppid();
2171 errno_pipe[0] = safe_close(errno_pipe[0]);
2172
2173 /* Wait until the parent unshared the user namespace */
2174 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2175 r = -errno;
2176 goto child_fail;
2177 }
2178
2179 /* Disable the setgroups() system call in the child user namespace, for good. */
2180 a = procfs_file_alloca(ppid, "setgroups");
2181 fd = open(a, O_WRONLY|O_CLOEXEC);
2182 if (fd < 0) {
2183 if (errno != ENOENT) {
2184 r = -errno;
2185 goto child_fail;
2186 }
2187
2188 /* If the file is missing the kernel is too old, let's continue anyway. */
2189 } else {
2190 if (write(fd, "deny\n", 5) < 0) {
2191 r = -errno;
2192 goto child_fail;
2193 }
2194
2195 fd = safe_close(fd);
2196 }
2197
2198 /* First write the GID map */
2199 a = procfs_file_alloca(ppid, "gid_map");
2200 fd = open(a, O_WRONLY|O_CLOEXEC);
2201 if (fd < 0) {
2202 r = -errno;
2203 goto child_fail;
2204 }
2205 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2206 r = -errno;
2207 goto child_fail;
2208 }
2209 fd = safe_close(fd);
2210
2211 /* The write the UID map */
2212 a = procfs_file_alloca(ppid, "uid_map");
2213 fd = open(a, O_WRONLY|O_CLOEXEC);
2214 if (fd < 0) {
2215 r = -errno;
2216 goto child_fail;
2217 }
2218 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2219 r = -errno;
2220 goto child_fail;
2221 }
2222
2223 _exit(EXIT_SUCCESS);
2224
2225 child_fail:
2226 (void) write(errno_pipe[1], &r, sizeof(r));
2227 _exit(EXIT_FAILURE);
2228 }
2229
2230 errno_pipe[1] = safe_close(errno_pipe[1]);
2231
2232 if (unshare(CLONE_NEWUSER) < 0)
2233 return -errno;
2234
2235 /* Let the child know that the namespace is ready now */
2236 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2237 return -errno;
2238
2239 /* Try to read an error code from the child */
2240 n = read(errno_pipe[0], &r, sizeof(r));
2241 if (n < 0)
2242 return -errno;
2243 if (n == sizeof(r)) { /* an error code was sent to us */
2244 if (r < 0)
2245 return r;
2246 return -EIO;
2247 }
2248 if (n != 0) /* on success we should have read 0 bytes */
2249 return -EIO;
2250
2251 r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2252 pid = 0;
2253 if (r < 0)
2254 return r;
2255 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2256 return -EIO;
2257
2258 return 0;
2259 }
2260
2261 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2262 if (!context->dynamic_user)
2263 return false;
2264
2265 if (type == EXEC_DIRECTORY_CONFIGURATION)
2266 return false;
2267
2268 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2269 return false;
2270
2271 return true;
2272 }
2273
2274 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2275 _cleanup_free_ char *src_abs = NULL;
2276 char **dst;
2277 int r;
2278
2279 assert(source);
2280
2281 src_abs = path_join(root, source);
2282 if (!src_abs)
2283 return -ENOMEM;
2284
2285 STRV_FOREACH(dst, symlinks) {
2286 _cleanup_free_ char *dst_abs = NULL;
2287
2288 dst_abs = path_join(root, *dst);
2289 if (!dst_abs)
2290 return -ENOMEM;
2291
2292 r = mkdir_parents_label(dst_abs, 0755);
2293 if (r < 0)
2294 return r;
2295
2296 r = symlink_idempotent(src_abs, dst_abs, true);
2297 if (r < 0)
2298 return r;
2299 }
2300
2301 return 0;
2302 }
2303
2304 static int setup_exec_directory(
2305 const ExecContext *context,
2306 const ExecParameters *params,
2307 uid_t uid,
2308 gid_t gid,
2309 ExecDirectoryType type,
2310 bool needs_mount_namespace,
2311 int *exit_status) {
2312
2313 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2314 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2315 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2316 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2317 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2318 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2319 };
2320 int r;
2321
2322 assert(context);
2323 assert(params);
2324 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2325 assert(exit_status);
2326
2327 if (!params->prefix[type])
2328 return 0;
2329
2330 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2331 if (!uid_is_valid(uid))
2332 uid = 0;
2333 if (!gid_is_valid(gid))
2334 gid = 0;
2335 }
2336
2337 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2338 _cleanup_free_ char *p = NULL, *pp = NULL;
2339
2340 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2341 if (!p) {
2342 r = -ENOMEM;
2343 goto fail;
2344 }
2345
2346 r = mkdir_parents_label(p, 0755);
2347 if (r < 0)
2348 goto fail;
2349
2350 if (exec_directory_is_private(context, type)) {
2351 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2352 * case we want to avoid leaving a directory around fully accessible that is owned by
2353 * a dynamic user whose UID is later on reused. To lock this down we use the same
2354 * trick used by container managers to prohibit host users to get access to files of
2355 * the same UID in containers: we place everything inside a directory that has an
2356 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2357 * for unprivileged host code. We then use fs namespacing to make this directory
2358 * permeable for the service itself.
2359 *
2360 * Specifically: for a service which wants a special directory "foo/" we first create
2361 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2362 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2363 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2364 * unprivileged host users can't look into it. Inside of the namespace of the unit
2365 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2366 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2367 * for the service and making sure it only gets access to the dirs it needs but no
2368 * others. Tricky? Yes, absolutely, but it works!
2369 *
2370 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2371 * to be owned by the service itself.
2372 *
2373 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2374 * for sharing files or sockets with other services. */
2375
2376 pp = path_join(params->prefix[type], "private");
2377 if (!pp) {
2378 r = -ENOMEM;
2379 goto fail;
2380 }
2381
2382 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2383 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2384 if (r < 0)
2385 goto fail;
2386
2387 if (!path_extend(&pp, context->directories[type].items[i].path)) {
2388 r = -ENOMEM;
2389 goto fail;
2390 }
2391
2392 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2393 r = mkdir_parents_label(pp, 0755);
2394 if (r < 0)
2395 goto fail;
2396
2397 if (is_dir(p, false) > 0 &&
2398 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2399
2400 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2401 * it over. Most likely the service has been upgraded from one that didn't use
2402 * DynamicUser=1, to one that does. */
2403
2404 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2405 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2406 exec_directory_type_to_string(type), p, pp);
2407
2408 if (rename(p, pp) < 0) {
2409 r = -errno;
2410 goto fail;
2411 }
2412 } else {
2413 /* Otherwise, create the actual directory for the service */
2414
2415 r = mkdir_label(pp, context->directories[type].mode);
2416 if (r < 0 && r != -EEXIST)
2417 goto fail;
2418 }
2419
2420 /* And link it up from the original place. Note that if a mount namespace is going to be
2421 * used, then this symlink remains on the host, and a new one for the child namespace will
2422 * be created later. */
2423 r = symlink_idempotent(pp, p, true);
2424 if (r < 0)
2425 goto fail;
2426
2427 } else {
2428 _cleanup_free_ char *target = NULL;
2429
2430 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2431 readlink_and_make_absolute(p, &target) >= 0) {
2432 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2433
2434 /* This already exists and is a symlink? Interesting. Maybe it's one created
2435 * by DynamicUser=1 (see above)?
2436 *
2437 * We do this for all directory types except for ConfigurationDirectory=,
2438 * since they all support the private/ symlink logic at least in some
2439 * configurations, see above. */
2440
2441 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2442 if (r < 0)
2443 goto fail;
2444
2445 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2446 if (!q) {
2447 r = -ENOMEM;
2448 goto fail;
2449 }
2450
2451 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2452 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2453 if (r < 0)
2454 goto fail;
2455
2456 if (path_equal(q_resolved, target_resolved)) {
2457
2458 /* Hmm, apparently DynamicUser= was once turned on for this service,
2459 * but is no longer. Let's move the directory back up. */
2460
2461 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2462 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2463 exec_directory_type_to_string(type), q, p);
2464
2465 if (unlink(p) < 0) {
2466 r = -errno;
2467 goto fail;
2468 }
2469
2470 if (rename(q, p) < 0) {
2471 r = -errno;
2472 goto fail;
2473 }
2474 }
2475 }
2476
2477 r = mkdir_label(p, context->directories[type].mode);
2478 if (r < 0) {
2479 if (r != -EEXIST)
2480 goto fail;
2481
2482 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2483 struct stat st;
2484
2485 /* Don't change the owner/access mode of the configuration directory,
2486 * as in the common case it is not written to by a service, and shall
2487 * not be writable. */
2488
2489 if (stat(p, &st) < 0) {
2490 r = -errno;
2491 goto fail;
2492 }
2493
2494 /* Still complain if the access mode doesn't match */
2495 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2496 log_warning("%s \'%s\' already exists but the mode is different. "
2497 "(File system: %o %sMode: %o)",
2498 exec_directory_type_to_string(type), context->directories[type].items[i].path,
2499 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2500
2501 continue;
2502 }
2503 }
2504 }
2505
2506 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2507 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2508 * current UID/GID ownership.) */
2509 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2510 if (r < 0)
2511 goto fail;
2512
2513 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2514 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2515 * assignments to exist. */
2516 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2517 if (r < 0)
2518 goto fail;
2519 }
2520
2521 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2522 * they are set up later, to allow configuring empty var/run/etc. */
2523 if (!needs_mount_namespace)
2524 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2525 r = create_many_symlinks(params->prefix[type],
2526 context->directories[type].items[i].path,
2527 context->directories[type].items[i].symlinks);
2528 if (r < 0)
2529 goto fail;
2530 }
2531
2532 return 0;
2533
2534 fail:
2535 *exit_status = exit_status_table[type];
2536 return r;
2537 }
2538
2539 static int write_credential(
2540 int dfd,
2541 const char *id,
2542 const void *data,
2543 size_t size,
2544 uid_t uid,
2545 bool ownership_ok) {
2546
2547 _cleanup_(unlink_and_freep) char *tmp = NULL;
2548 _cleanup_close_ int fd = -1;
2549 int r;
2550
2551 r = tempfn_random_child("", "cred", &tmp);
2552 if (r < 0)
2553 return r;
2554
2555 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2556 if (fd < 0) {
2557 tmp = mfree(tmp);
2558 return -errno;
2559 }
2560
2561 r = loop_write(fd, data, size, /* do_poll = */ false);
2562 if (r < 0)
2563 return r;
2564
2565 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2566 return -errno;
2567
2568 if (uid_is_valid(uid) && uid != getuid()) {
2569 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2570 if (r < 0) {
2571 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2572 return r;
2573
2574 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2575 * to express: that the user gets read access and nothing
2576 * else. But if the backing fs can't support that (e.g. ramfs)
2577 * then we can use file ownership instead. But that's only safe if
2578 * we can then re-mount the whole thing read-only, so that the
2579 * user can no longer chmod() the file to gain write access. */
2580 return r;
2581
2582 if (fchown(fd, uid, GID_INVALID) < 0)
2583 return -errno;
2584 }
2585 }
2586
2587 if (renameat(dfd, tmp, dfd, id) < 0)
2588 return -errno;
2589
2590 tmp = mfree(tmp);
2591 return 0;
2592 }
2593
2594 static int acquire_credentials(
2595 const ExecContext *context,
2596 const ExecParameters *params,
2597 const char *unit,
2598 const char *p,
2599 uid_t uid,
2600 bool ownership_ok) {
2601
2602 uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
2603 _cleanup_close_ int dfd = -1;
2604 ExecLoadCredential *lc;
2605 ExecSetCredential *sc;
2606 int r;
2607
2608 assert(context);
2609 assert(p);
2610
2611 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2612 if (dfd < 0)
2613 return -errno;
2614
2615 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2616 HASHMAP_FOREACH(lc, context->load_credentials) {
2617 ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2618 _cleanup_(erase_and_freep) char *data = NULL;
2619 _cleanup_free_ char *j = NULL, *bindname = NULL;
2620 bool missing_ok = true;
2621 const char *source;
2622 size_t size, add;
2623
2624 if (path_is_absolute(lc->path)) {
2625 /* If this is an absolute path, read the data directly from it, and support AF_UNIX sockets */
2626 source = lc->path;
2627 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2628
2629 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2630 * via the source socket address in case we read off an AF_UNIX socket. */
2631 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, lc->id) < 0)
2632 return -ENOMEM;
2633
2634 missing_ok = false;
2635
2636 } else if (params->received_credentials) {
2637 /* If this is a relative path, take it relative to the credentials we received
2638 * ourselves. We don't support the AF_UNIX stuff in this mode, since we are operating
2639 * on a credential store, i.e. this is guaranteed to be regular files. */
2640 j = path_join(params->received_credentials, lc->path);
2641 if (!j)
2642 return -ENOMEM;
2643
2644 source = j;
2645 } else
2646 source = NULL;
2647
2648 if (source)
2649 r = read_full_file_full(
2650 AT_FDCWD, source,
2651 UINT64_MAX,
2652 lc->encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
2653 flags | (lc->encrypted ? READ_FULL_FILE_UNBASE64 : 0),
2654 bindname,
2655 &data, &size);
2656 else
2657 r = -ENOENT;
2658 if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, lc->id))) {
2659 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2660 * will get clear errors if we don't pass such a missing credential on as they
2661 * themselves will get ENOENT when trying to read them, which should not be much
2662 * worse than when we handle the error here and make it fatal.
2663 *
2664 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2665 * we are fine, too. */
2666 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", lc->path);
2667 continue;
2668 }
2669 if (r < 0)
2670 return log_debug_errno(r, "Failed to read credential '%s': %m", lc->path);
2671
2672 if (lc->encrypted) {
2673 _cleanup_free_ void *plaintext = NULL;
2674 size_t plaintext_size = 0;
2675
2676 r = decrypt_credential_and_warn(lc->id, now(CLOCK_REALTIME), NULL, data, size, &plaintext, &plaintext_size);
2677 if (r < 0)
2678 return r;
2679
2680 free_and_replace(data, plaintext);
2681 size = plaintext_size;
2682 }
2683
2684 add = strlen(lc->id) + size;
2685 if (add > left)
2686 return -E2BIG;
2687
2688 r = write_credential(dfd, lc->id, data, size, uid, ownership_ok);
2689 if (r < 0)
2690 return r;
2691
2692 left -= add;
2693 }
2694
2695 /* First we use the literally specified credentials. Note that they might be overridden again below,
2696 * and thus act as a "default" if the same credential is specified multiple times */
2697 HASHMAP_FOREACH(sc, context->set_credentials) {
2698 _cleanup_(erase_and_freep) void *plaintext = NULL;
2699 const char *data;
2700 size_t size, add;
2701
2702 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2703 continue;
2704 if (errno != ENOENT)
2705 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2706
2707 if (sc->encrypted) {
2708 r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, sc->data, sc->size, &plaintext, &size);
2709 if (r < 0)
2710 return r;
2711
2712 data = plaintext;
2713 } else {
2714 data = sc->data;
2715 size = sc->size;
2716 }
2717
2718 add = strlen(sc->id) + size;
2719 if (add > left)
2720 return -E2BIG;
2721
2722 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2723 if (r < 0)
2724 return r;
2725
2726
2727 left -= add;
2728 }
2729
2730 if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2731 return -errno;
2732
2733 /* After we created all keys with the right perms, also make sure the credential store as a whole is
2734 * accessible */
2735
2736 if (uid_is_valid(uid) && uid != getuid()) {
2737 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
2738 if (r < 0) {
2739 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2740 return r;
2741
2742 if (!ownership_ok)
2743 return r;
2744
2745 if (fchown(dfd, uid, GID_INVALID) < 0)
2746 return -errno;
2747 }
2748 }
2749
2750 return 0;
2751 }
2752
2753 static int setup_credentials_internal(
2754 const ExecContext *context,
2755 const ExecParameters *params,
2756 const char *unit,
2757 const char *final, /* This is where the credential store shall eventually end up at */
2758 const char *workspace, /* This is where we can prepare it before moving it to the final place */
2759 bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
2760 bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2761 uid_t uid) {
2762
2763 int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2764 * if we mounted something; false if we definitely can't mount anything */
2765 bool final_mounted;
2766 const char *where;
2767
2768 assert(context);
2769 assert(final);
2770 assert(workspace);
2771
2772 if (reuse_workspace) {
2773 r = path_is_mount_point(workspace, NULL, 0);
2774 if (r < 0)
2775 return r;
2776 if (r > 0)
2777 workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2778 else
2779 workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
2780 } else
2781 workspace_mounted = -1; /* ditto */
2782
2783 r = path_is_mount_point(final, NULL, 0);
2784 if (r < 0)
2785 return r;
2786 if (r > 0) {
2787 /* If the final place already has something mounted, we use that. If the workspace also has
2788 * something mounted we assume it's actually the same mount (but with MS_RDONLY
2789 * different). */
2790 final_mounted = true;
2791
2792 if (workspace_mounted < 0) {
2793 /* If the final place is mounted, but the workspace we isn't, then let's bind mount
2794 * the final version to the workspace, and make it writable, so that we can make
2795 * changes */
2796
2797 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2798 if (r < 0)
2799 return r;
2800
2801 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2802 if (r < 0)
2803 return r;
2804
2805 workspace_mounted = true;
2806 }
2807 } else
2808 final_mounted = false;
2809
2810 if (workspace_mounted < 0) {
2811 /* Nothing is mounted on the workspace yet, let's try to mount something now */
2812 for (int try = 0;; try++) {
2813
2814 if (try == 0) {
2815 /* Try "ramfs" first, since it's not swap backed */
2816 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
2817 if (r >= 0) {
2818 workspace_mounted = true;
2819 break;
2820 }
2821
2822 } else if (try == 1) {
2823 _cleanup_free_ char *opts = NULL;
2824
2825 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
2826 return -ENOMEM;
2827
2828 /* Fall back to "tmpfs" otherwise */
2829 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
2830 if (r >= 0) {
2831 workspace_mounted = true;
2832 break;
2833 }
2834
2835 } else {
2836 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
2837 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2838 if (r < 0) {
2839 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
2840 return r;
2841
2842 if (must_mount) /* If we it's not OK to use the plain directory
2843 * fallback, propagate all errors too */
2844 return r;
2845
2846 /* If we lack privileges to bind mount stuff, then let's gracefully
2847 * proceed for compat with container envs, and just use the final dir
2848 * as is. */
2849
2850 workspace_mounted = false;
2851 break;
2852 }
2853
2854 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
2855 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2856 if (r < 0)
2857 return r;
2858
2859 workspace_mounted = true;
2860 break;
2861 }
2862 }
2863 }
2864
2865 assert(!must_mount || workspace_mounted > 0);
2866 where = workspace_mounted ? workspace : final;
2867
2868 r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
2869 if (r < 0)
2870 return r;
2871
2872 if (workspace_mounted) {
2873 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
2874 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2875 if (r < 0)
2876 return r;
2877
2878 /* And mount it to the final place, read-only */
2879 if (final_mounted)
2880 r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
2881 else
2882 r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
2883 if (r < 0)
2884 return r;
2885 } else {
2886 _cleanup_free_ char *parent = NULL;
2887
2888 /* If we do not have our own mount put used the plain directory fallback, then we need to
2889 * open access to the top-level credential directory and the per-service directory now */
2890
2891 parent = dirname_malloc(final);
2892 if (!parent)
2893 return -ENOMEM;
2894 if (chmod(parent, 0755) < 0)
2895 return -errno;
2896 }
2897
2898 return 0;
2899 }
2900
2901 static int setup_credentials(
2902 const ExecContext *context,
2903 const ExecParameters *params,
2904 const char *unit,
2905 uid_t uid) {
2906
2907 _cleanup_free_ char *p = NULL, *q = NULL;
2908 const char *i;
2909 int r;
2910
2911 assert(context);
2912 assert(params);
2913
2914 if (!exec_context_has_credentials(context))
2915 return 0;
2916
2917 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
2918 return -EINVAL;
2919
2920 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
2921 * and the subdir we mount over with a read-only file system readable by the service's user */
2922 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
2923 if (!q)
2924 return -ENOMEM;
2925
2926 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
2927 if (r < 0 && r != -EEXIST)
2928 return r;
2929
2930 p = path_join(q, unit);
2931 if (!p)
2932 return -ENOMEM;
2933
2934 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
2935 if (r < 0 && r != -EEXIST)
2936 return r;
2937
2938 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
2939 if (r < 0) {
2940 _cleanup_free_ char *t = NULL, *u = NULL;
2941
2942 /* If this is not a privilege or support issue then propagate the error */
2943 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2944 return r;
2945
2946 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
2947 * it into place, so that users can't access half-initialized credential stores. */
2948 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
2949 if (!t)
2950 return -ENOMEM;
2951
2952 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
2953 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
2954 * after it is fully set up */
2955 u = path_join(t, unit);
2956 if (!u)
2957 return -ENOMEM;
2958
2959 FOREACH_STRING(i, t, u) {
2960 r = mkdir_label(i, 0700);
2961 if (r < 0 && r != -EEXIST)
2962 return r;
2963 }
2964
2965 r = setup_credentials_internal(
2966 context,
2967 params,
2968 unit,
2969 p, /* final mount point */
2970 u, /* temporary workspace to overmount */
2971 true, /* reuse the workspace if it is already a mount */
2972 false, /* it's OK to fall back to a plain directory if we can't mount anything */
2973 uid);
2974
2975 (void) rmdir(u); /* remove the workspace again if we can. */
2976
2977 if (r < 0)
2978 return r;
2979
2980 } else if (r == 0) {
2981
2982 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
2983 * we can use the same directory for all cases, after turning off propagation. Question
2984 * though is: where do we turn off propagation exactly, and where do we place the workspace
2985 * directory? We need some place that is guaranteed to be a mount point in the host, and
2986 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
2987 * since we ultimately want to move the resulting file system there, i.e. we need propagation
2988 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
2989 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
2990 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
2991 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
2992 * propagation on the former, and then overmount the latter.
2993 *
2994 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
2995 * for this purpose, but there are few other candidates that work equally well for us, and
2996 * given that the we do this in a privately namespaced short-lived single-threaded process
2997 * that no one else sees this should be OK to do. */
2998
2999 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3000 if (r < 0)
3001 goto child_fail;
3002
3003 r = setup_credentials_internal(
3004 context,
3005 params,
3006 unit,
3007 p, /* final mount point */
3008 "/dev/shm", /* temporary workspace to overmount */
3009 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3010 true, /* insist that something is mounted, do not allow fallback to plain directory */
3011 uid);
3012 if (r < 0)
3013 goto child_fail;
3014
3015 _exit(EXIT_SUCCESS);
3016
3017 child_fail:
3018 _exit(EXIT_FAILURE);
3019 }
3020
3021 return 0;
3022 }
3023
#if ENABLE_SMACK
/* Applies the SMACK label for the process about to be executed: the label configured in the context if
 * there is one; otherwise, if SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the exec label read
 * from 'executable_fd', falling back to that default label.
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
static int setup_smack(
                const ExecContext *context,
                int executable_fd) {
        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;

                return 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;

                /* A missing label, or a file system without label support, is not an error here: we
                 * simply go for the default label then. */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
                        return r;

                r = mac_smack_apply_pid(0, exec_label ?: SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
3055
3056 static int compile_bind_mounts(
3057 const ExecContext *context,
3058 const ExecParameters *params,
3059 BindMount **ret_bind_mounts,
3060 size_t *ret_n_bind_mounts,
3061 char ***ret_empty_directories) {
3062
3063 _cleanup_strv_free_ char **empty_directories = NULL;
3064 BindMount *bind_mounts;
3065 size_t n, h = 0;
3066 int r;
3067
3068 assert(context);
3069 assert(params);
3070 assert(ret_bind_mounts);
3071 assert(ret_n_bind_mounts);
3072 assert(ret_empty_directories);
3073
3074 n = context->n_bind_mounts;
3075 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3076 if (!params->prefix[t])
3077 continue;
3078
3079 n += context->directories[t].n_items;
3080 }
3081
3082 if (n <= 0) {
3083 *ret_bind_mounts = NULL;
3084 *ret_n_bind_mounts = 0;
3085 *ret_empty_directories = NULL;
3086 return 0;
3087 }
3088
3089 bind_mounts = new(BindMount, n);
3090 if (!bind_mounts)
3091 return -ENOMEM;
3092
3093 for (size_t i = 0; i < context->n_bind_mounts; i++) {
3094 BindMount *item = context->bind_mounts + i;
3095 char *s, *d;
3096
3097 s = strdup(item->source);
3098 if (!s) {
3099 r = -ENOMEM;
3100 goto finish;
3101 }
3102
3103 d = strdup(item->destination);
3104 if (!d) {
3105 free(s);
3106 r = -ENOMEM;
3107 goto finish;
3108 }
3109
3110 bind_mounts[h++] = (BindMount) {
3111 .source = s,
3112 .destination = d,
3113 .read_only = item->read_only,
3114 .recursive = item->recursive,
3115 .ignore_enoent = item->ignore_enoent,
3116 };
3117 }
3118
3119 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3120 if (!params->prefix[t])
3121 continue;
3122
3123 if (context->directories[t].n_items == 0)
3124 continue;
3125
3126 if (exec_directory_is_private(context, t) &&
3127 !exec_context_with_rootfs(context)) {
3128 char *private_root;
3129
3130 /* So this is for a dynamic user, and we need to make sure the process can access its own
3131 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3132 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3133
3134 private_root = path_join(params->prefix[t], "private");
3135 if (!private_root) {
3136 r = -ENOMEM;
3137 goto finish;
3138 }
3139
3140 r = strv_consume(&empty_directories, private_root);
3141 if (r < 0)
3142 goto finish;
3143 }
3144
3145 for (size_t i = 0; i < context->directories[t].n_items; i++) {
3146 char *s, *d;
3147
3148 if (exec_directory_is_private(context, t))
3149 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
3150 else
3151 s = path_join(params->prefix[t], context->directories[t].items[i].path);
3152 if (!s) {
3153 r = -ENOMEM;
3154 goto finish;
3155 }
3156
3157 if (exec_directory_is_private(context, t) &&
3158 exec_context_with_rootfs(context))
3159 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3160 * directory is not created on the root directory. So, let's bind-mount the directory
3161 * on the 'non-private' place. */
3162 d = path_join(params->prefix[t], context->directories[t].items[i].path);
3163 else
3164 d = strdup(s);
3165 if (!d) {
3166 free(s);
3167 r = -ENOMEM;
3168 goto finish;
3169 }
3170
3171 bind_mounts[h++] = (BindMount) {
3172 .source = s,
3173 .destination = d,
3174 .read_only = false,
3175 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3176 .recursive = true,
3177 .ignore_enoent = false,
3178 };
3179 }
3180 }
3181
3182 assert(h == n);
3183
3184 *ret_bind_mounts = bind_mounts;
3185 *ret_n_bind_mounts = n;
3186 *ret_empty_directories = TAKE_PTR(empty_directories);
3187
3188 return (int) n;
3189
3190 finish:
3191 bind_mount_free_many(bind_mounts, h);
3192 return r;
3193 }
3194
3195 /* ret_symlinks will contain a list of pairs src:dest that describes
3196 * the symlinks to create later on. For example, the symlinks needed
3197 * to safely give private directories to DynamicUser=1 users. */
3198 static int compile_symlinks(
3199 const ExecContext *context,
3200 const ExecParameters *params,
3201 char ***ret_symlinks) {
3202
3203 _cleanup_strv_free_ char **symlinks = NULL;
3204 int r;
3205
3206 assert(context);
3207 assert(params);
3208 assert(ret_symlinks);
3209
3210 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3211 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3212 _cleanup_free_ char *private_path = NULL, *path = NULL;
3213 char **symlink;
3214
3215 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3216 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3217
3218 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3219 dst_abs = path_join(params->prefix[dt], *symlink);
3220 if (!src_abs || !dst_abs)
3221 return -ENOMEM;
3222
3223 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3224 if (r < 0)
3225 return r;
3226 }
3227
3228 if (!exec_directory_is_private(context, dt))
3229 continue;
3230
3231 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
3232 if (!private_path)
3233 return -ENOMEM;
3234
3235 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3236 if (!path)
3237 return -ENOMEM;
3238
3239 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3240 if (r < 0)
3241 return r;
3242 }
3243 }
3244
3245 *ret_symlinks = TAKE_PTR(symlinks);
3246
3247 return 0;
3248 }
3249
3250 static bool insist_on_sandboxing(
3251 const ExecContext *context,
3252 const char *root_dir,
3253 const char *root_image,
3254 const BindMount *bind_mounts,
3255 size_t n_bind_mounts) {
3256
3257 assert(context);
3258 assert(n_bind_mounts == 0 || bind_mounts);
3259
3260 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3261 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3262 * rearrange stuff in a way we cannot ignore gracefully. */
3263
3264 if (context->n_temporary_filesystems > 0)
3265 return true;
3266
3267 if (root_dir || root_image)
3268 return true;
3269
3270 if (context->n_mount_images > 0)
3271 return true;
3272
3273 if (context->dynamic_user)
3274 return true;
3275
3276 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3277 * essential. */
3278 for (size_t i = 0; i < n_bind_mounts; i++)
3279 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3280 return true;
3281
3282 if (context->log_namespace)
3283 return true;
3284
3285 return false;
3286 }
3287
3288 static int apply_mount_namespace(
3289 const Unit *u,
3290 ExecCommandFlags command_flags,
3291 const ExecContext *context,
3292 const ExecParameters *params,
3293 const ExecRuntime *runtime,
3294 char **error_path) {
3295
3296 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
3297 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3298 const char *root_dir = NULL, *root_image = NULL;
3299 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL;
3300 NamespaceInfo ns_info;
3301 bool needs_sandboxing;
3302 BindMount *bind_mounts = NULL;
3303 size_t n_bind_mounts = 0;
3304 int r;
3305
3306 assert(context);
3307
3308 if (params->flags & EXEC_APPLY_CHROOT) {
3309 root_image = context->root_image;
3310
3311 if (!root_image)
3312 root_dir = context->root_directory;
3313 }
3314
3315 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3316 if (r < 0)
3317 return r;
3318
3319 /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
3320 r = compile_symlinks(context, params, &symlinks);
3321 if (r < 0)
3322 return r;
3323
3324 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3325 if (needs_sandboxing) {
3326 /* The runtime struct only contains the parent of the private /tmp,
3327 * which is non-accessible to world users. Inside of it there's a /tmp
3328 * that is sticky, and that's the one we want to use here.
3329 * This does not apply when we are using /run/systemd/empty as fallback. */
3330
3331 if (context->private_tmp && runtime) {
3332 if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3333 tmp_dir = runtime->tmp_dir;
3334 else if (runtime->tmp_dir)
3335 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3336
3337 if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3338 var_tmp_dir = runtime->var_tmp_dir;
3339 else if (runtime->var_tmp_dir)
3340 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
3341 }
3342
3343 ns_info = (NamespaceInfo) {
3344 .ignore_protect_paths = false,
3345 .private_dev = context->private_devices,
3346 .protect_control_groups = context->protect_control_groups,
3347 .protect_kernel_tunables = context->protect_kernel_tunables,
3348 .protect_kernel_modules = context->protect_kernel_modules,
3349 .protect_kernel_logs = context->protect_kernel_logs,
3350 .protect_hostname = context->protect_hostname,
3351 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
3352 .private_mounts = context->private_mounts,
3353 .protect_home = context->protect_home,
3354 .protect_system = context->protect_system,
3355 .protect_proc = context->protect_proc,
3356 .proc_subset = context->proc_subset,
3357 .private_ipc = context->private_ipc || context->ipc_namespace_path,
3358 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3359 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
3360 };
3361 } else if (!context->dynamic_user && root_dir)
3362 /*
3363 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3364 * sandbox info, otherwise enforce it, don't ignore protected paths and
3365 * fail if we are enable to apply the sandbox inside the mount namespace.
3366 */
3367 ns_info = (NamespaceInfo) {
3368 .ignore_protect_paths = true,
3369 };
3370 else
3371 ns_info = (NamespaceInfo) {};
3372
3373 if (context->mount_flags == MS_SHARED)
3374 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3375
3376 if (exec_context_has_credentials(context) &&
3377 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3378 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3379 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
3380 if (!creds_path) {
3381 r = -ENOMEM;
3382 goto finalize;
3383 }
3384 }
3385
3386 if (MANAGER_IS_SYSTEM(u->manager)) {
3387 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3388 if (!propagate_dir) {
3389 r = -ENOMEM;
3390 goto finalize;
3391 }
3392
3393 incoming_dir = strdup("/run/systemd/incoming");
3394 if (!incoming_dir) {
3395 r = -ENOMEM;
3396 goto finalize;
3397 }
3398 }
3399
3400 r = setup_namespace(root_dir, root_image, context->root_image_options,
3401 &ns_info, context->read_write_paths,
3402 needs_sandboxing ? context->read_only_paths : NULL,
3403 needs_sandboxing ? context->inaccessible_paths : NULL,
3404 needs_sandboxing ? context->exec_paths : NULL,
3405 needs_sandboxing ? context->no_exec_paths : NULL,
3406 empty_directories,
3407 symlinks,
3408 bind_mounts,
3409 n_bind_mounts,
3410 context->temporary_filesystems,
3411 context->n_temporary_filesystems,
3412 context->mount_images,
3413 context->n_mount_images,
3414 tmp_dir,
3415 var_tmp_dir,
3416 creds_path,
3417 context->log_namespace,
3418 context->mount_flags,
3419 context->root_hash, context->root_hash_size, context->root_hash_path,
3420 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3421 context->root_verity,
3422 context->extension_images,
3423 context->n_extension_images,
3424 propagate_dir,
3425 incoming_dir,
3426 root_dir || root_image ? params->notify_socket : NULL,
3427 error_path);
3428
3429 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3430 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3431 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3432 * completely different execution environment. */
3433 if (r == -ENOANO) {
3434 if (insist_on_sandboxing(
3435 context,
3436 root_dir, root_image,
3437 bind_mounts,
3438 n_bind_mounts)) {
3439 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3440 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3441 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3442
3443 r = -EOPNOTSUPP;
3444 } else {
3445 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3446 r = 0;
3447 }
3448 }
3449
3450 finalize:
3451 bind_mount_free_many(bind_mounts, n_bind_mounts);
3452 return r;
3453 }
3454
3455 static int apply_working_directory(
3456 const ExecContext *context,
3457 const ExecParameters *params,
3458 const char *home,
3459 int *exit_status) {
3460
3461 const char *d, *wd;
3462
3463 assert(context);
3464 assert(exit_status);
3465
3466 if (context->working_directory_home) {
3467
3468 if (!home) {
3469 *exit_status = EXIT_CHDIR;
3470 return -ENXIO;
3471 }
3472
3473 wd = home;
3474
3475 } else
3476 wd = empty_to_root(context->working_directory);
3477
3478 if (params->flags & EXEC_APPLY_CHROOT)
3479 d = wd;
3480 else
3481 d = prefix_roota(context->root_directory, wd);
3482
3483 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3484 *exit_status = EXIT_CHDIR;
3485 return -errno;
3486 }
3487
3488 return 0;
3489 }
3490
3491 static int apply_root_directory(
3492 const ExecContext *context,
3493 const ExecParameters *params,
3494 const bool needs_mount_ns,
3495 int *exit_status) {
3496
3497 assert(context);
3498 assert(exit_status);
3499
3500 if (params->flags & EXEC_APPLY_CHROOT)
3501 if (!needs_mount_ns && context->root_directory)
3502 if (chroot(context->root_directory) < 0) {
3503 *exit_status = EXIT_CHROOT;
3504 return -errno;
3505 }
3506
3507 return 0;
3508 }
3509
3510 static int setup_keyring(
3511 const Unit *u,
3512 const ExecContext *context,
3513 const ExecParameters *p,
3514 uid_t uid, gid_t gid) {
3515
3516 key_serial_t keyring;
3517 int r = 0;
3518 uid_t saved_uid;
3519 gid_t saved_gid;
3520
3521 assert(u);
3522 assert(context);
3523 assert(p);
3524
3525 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3526 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3527 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3528 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3529 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3530 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3531
3532 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3533 return 0;
3534
3535 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3536 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3537 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3538 * & group is just as nasty as acquiring a reference to the user keyring. */
3539
3540 saved_uid = getuid();
3541 saved_gid = getgid();
3542
3543 if (gid_is_valid(gid) && gid != saved_gid) {
3544 if (setregid(gid, -1) < 0)
3545 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3546 }
3547
3548 if (uid_is_valid(uid) && uid != saved_uid) {
3549 if (setreuid(uid, -1) < 0) {
3550 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3551 goto out;
3552 }
3553 }
3554
3555 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3556 if (keyring == -1) {
3557 if (errno == ENOSYS)
3558 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3559 else if (ERRNO_IS_PRIVILEGE(errno))
3560 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3561 else if (errno == EDQUOT)
3562 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3563 else
3564 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3565
3566 goto out;
3567 }
3568
3569 /* When requested link the user keyring into the session keyring. */
3570 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3571
3572 if (keyctl(KEYCTL_LINK,
3573 KEY_SPEC_USER_KEYRING,
3574 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3575 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3576 goto out;
3577 }
3578 }
3579
3580 /* Restore uid/gid back */
3581 if (uid_is_valid(uid) && uid != saved_uid) {
3582 if (setreuid(saved_uid, -1) < 0) {
3583 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3584 goto out;
3585 }
3586 }
3587
3588 if (gid_is_valid(gid) && gid != saved_gid) {
3589 if (setregid(saved_gid, -1) < 0)
3590 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3591 }
3592
3593 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3594 if (!sd_id128_is_null(u->invocation_id)) {
3595 key_serial_t key;
3596
3597 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3598 if (key == -1)
3599 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3600 else {
3601 if (keyctl(KEYCTL_SETPERM, key,
3602 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3603 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3604 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3605 }
3606 }
3607
3608 out:
3609 /* Revert back uid & gid for the last time, and exit */
3610 /* no extra logging, as only the first already reported error matters */
3611 if (getuid() != saved_uid)
3612 (void) setreuid(saved_uid, -1);
3613
3614 if (getgid() != saved_gid)
3615 (void) setregid(saved_gid, -1);
3616
3617 return r;
3618 }
3619
/* Appends each non-negative fd of the two-element "pair" to "array", advancing the element counter *n
 * by one for every fd stored. The caller must make sure "array" has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++)
                if (pair[i] >= 0)
                        array[(*n)++] = pair[i];
}
3630
3631 static int close_remaining_fds(
3632 const ExecParameters *params,
3633 const ExecRuntime *runtime,
3634 const DynamicCreds *dcreds,
3635 int user_lookup_fd,
3636 int socket_fd,
3637 const int *fds, size_t n_fds) {
3638
3639 size_t n_dont_close = 0;
3640 int dont_close[n_fds + 12];
3641
3642 assert(params);
3643
3644 if (params->stdin_fd >= 0)
3645 dont_close[n_dont_close++] = params->stdin_fd;
3646 if (params->stdout_fd >= 0)
3647 dont_close[n_dont_close++] = params->stdout_fd;
3648 if (params->stderr_fd >= 0)
3649 dont_close[n_dont_close++] = params->stderr_fd;
3650
3651 if (socket_fd >= 0)
3652 dont_close[n_dont_close++] = socket_fd;
3653 if (n_fds > 0) {
3654 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3655 n_dont_close += n_fds;
3656 }
3657
3658 if (runtime) {
3659 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
3660 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3661 }
3662
3663 if (dcreds) {
3664 if (dcreds->user)
3665 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3666 if (dcreds->group)
3667 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
3668 }
3669
3670 if (user_lookup_fd >= 0)
3671 dont_close[n_dont_close++] = user_lookup_fd;
3672
3673 return close_all_fds(dont_close, n_dont_close);
3674 }
3675
3676 static int send_user_lookup(
3677 Unit *unit,
3678 int user_lookup_fd,
3679 uid_t uid,
3680 gid_t gid) {
3681
3682 assert(unit);
3683
3684 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3685 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3686 * specified. */
3687
3688 if (user_lookup_fd < 0)
3689 return 0;
3690
3691 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3692 return 0;
3693
3694 if (writev(user_lookup_fd,
3695 (struct iovec[]) {
3696 IOVEC_INIT(&uid, sizeof(uid)),
3697 IOVEC_INIT(&gid, sizeof(gid)),
3698 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
3699 return -errno;
3700
3701 return 0;
3702 }
3703
3704 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3705 int r;
3706
3707 assert(c);
3708 assert(home);
3709 assert(buf);
3710
3711 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3712
3713 if (*home)
3714 return 0;
3715
3716 if (!c->working_directory_home)
3717 return 0;
3718
3719 r = get_home_dir(buf);
3720 if (r < 0)
3721 return r;
3722
3723 *home = *buf;
3724 return 1;
3725 }
3726
3727 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3728 _cleanup_strv_free_ char ** list = NULL;
3729 int r;
3730
3731 assert(c);
3732 assert(p);
3733 assert(ret);
3734
3735 assert(c->dynamic_user);
3736
3737 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3738 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3739 * directories. */
3740
3741 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3742 if (t == EXEC_DIRECTORY_CONFIGURATION)
3743 continue;
3744
3745 if (!p->prefix[t])
3746 continue;
3747
3748 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3749 char *e;
3750
3751 if (exec_directory_is_private(c, t))
3752 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3753 else
3754 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3755 if (!e)
3756 return -ENOMEM;
3757
3758 r = strv_consume(&list, e);
3759 if (r < 0)
3760 return r;
3761 }
3762 }
3763
3764 *ret = TAKE_PTR(list);
3765
3766 return 0;
3767 }
3768
3769 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3770 bool using_subcgroup;
3771 char *p;
3772
3773 assert(params);
3774 assert(ret);
3775
3776 if (!params->cgroup_path)
3777 return -EINVAL;
3778
3779 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3780 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3781 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3782 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3783 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3784 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3785 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3786 * flag, which is only passed for the former statements, not for the latter. */
3787
3788 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3789 if (using_subcgroup)
3790 p = path_join(params->cgroup_path, ".control");
3791 else
3792 p = strdup(params->cgroup_path);
3793 if (!p)
3794 return -ENOMEM;
3795
3796 *ret = p;
3797 return using_subcgroup;
3798 }
3799
3800 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3801 _cleanup_(cpu_set_reset) CPUSet s = {};
3802 int r;
3803
3804 assert(c);
3805 assert(ret);
3806
3807 if (!c->numa_policy.nodes.set) {
3808 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3809 return 0;
3810 }
3811
3812 r = numa_to_cpu_set(&c->numa_policy, &s);
3813 if (r < 0)
3814 return r;
3815
3816 cpu_set_reset(ret);
3817
3818 return cpu_set_add_all(ret, &s);
3819 }
3820
3821 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3822 assert(c);
3823
3824 return c->cpu_affinity_from_numa;
3825 }
3826
3827 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3828 int r;
3829
3830 assert(fds);
3831 assert(n_fds);
3832 assert(*n_fds < fds_size);
3833 assert(ret_fd);
3834
3835 if (fd < 0) {
3836 *ret_fd = -1;
3837 return 0;
3838 }
3839
3840 if (fd < 3 + (int) *n_fds) {
3841 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3842 * the fds we pass to the process (or which are closed only during execve). */
3843
3844 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3845 if (r < 0)
3846 return -errno;
3847
3848 CLOSE_AND_REPLACE(fd, r);
3849 }
3850
3851 *ret_fd = fds[*n_fds] = fd;
3852 (*n_fds) ++;
3853 return 1;
3854 }
3855
3856 static int exec_child(
3857 Unit *unit,
3858 const ExecCommand *command,
3859 const ExecContext *context,
3860 const ExecParameters *params,
3861 ExecRuntime *runtime,
3862 DynamicCreds *dcreds,
3863 int socket_fd,
3864 const int named_iofds[static 3],
3865 int *fds,
3866 size_t n_socket_fds,
3867 size_t n_storage_fds,
3868 char **files_env,
3869 int user_lookup_fd,
3870 int *exit_status) {
3871
3872 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
3873 int r, ngids = 0, exec_fd;
3874 _cleanup_free_ gid_t *supplementary_gids = NULL;
3875 const char *username = NULL, *groupname = NULL;
3876 _cleanup_free_ char *home_buffer = NULL;
3877 const char *home = NULL, *shell = NULL;
3878 char **final_argv = NULL;
3879 dev_t journal_stream_dev = 0;
3880 ino_t journal_stream_ino = 0;
3881 bool userns_set_up = false;
3882 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3883 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3884 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3885 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
3886 #if HAVE_SELINUX
3887 _cleanup_free_ char *mac_selinux_context_net = NULL;
3888 bool use_selinux = false;
3889 #endif
3890 #if ENABLE_SMACK
3891 bool use_smack = false;
3892 #endif
3893 #if HAVE_APPARMOR
3894 bool use_apparmor = false;
3895 #endif
3896 uid_t saved_uid = getuid();
3897 gid_t saved_gid = getgid();
3898 uid_t uid = UID_INVALID;
3899 gid_t gid = GID_INVALID;
3900 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3901 n_keep_fds; /* total number of fds not to close */
3902 int secure_bits;
3903 _cleanup_free_ gid_t *gids_after_pam = NULL;
3904 int ngids_after_pam = 0;
3905
3906 assert(unit);
3907 assert(command);
3908 assert(context);
3909 assert(params);
3910 assert(exit_status);
3911
3912 rename_process_from_path(command->path);
3913
3914 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3915 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3916 * both of which will be demoted to SIG_DFL. */
3917 (void) default_signals(SIGNALS_CRASH_HANDLER,
3918 SIGNALS_IGNORE);
3919
3920 if (context->ignore_sigpipe)
3921 (void) ignore_signals(SIGPIPE);
3922
3923 r = reset_signal_mask();
3924 if (r < 0) {
3925 *exit_status = EXIT_SIGNAL_MASK;
3926 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
3927 }
3928
3929 if (params->idle_pipe)
3930 do_idle_pipe_dance(params->idle_pipe);
3931
3932 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3933 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3934 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3935 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
3936
3937 log_forget_fds();
3938 log_set_open_when_needed(true);
3939
3940 /* In case anything used libc syslog(), close this here, too */
3941 closelog();
3942
3943 int keep_fds[n_fds + 3];
3944 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
3945 n_keep_fds = n_fds;
3946
3947 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
3948 if (r < 0) {
3949 *exit_status = EXIT_FDS;
3950 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3951 }
3952
3953 #if HAVE_LIBBPF
3954 if (MANAGER_IS_SYSTEM(unit->manager) && lsm_bpf_supported()) {
3955 int bpf_map_fd = -1;
3956
3957 bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
3958 if (bpf_map_fd < 0) {
3959 *exit_status = EXIT_FDS;
3960 return log_unit_error_errno(unit, r, "Failed to get restrict filesystems BPF map fd: %m");
3961 }
3962
3963 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
3964 if (r < 0) {
3965 *exit_status = EXIT_FDS;
3966 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3967 }
3968 }
3969 #endif
3970
3971 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
3972 if (r < 0) {
3973 *exit_status = EXIT_FDS;
3974 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3975 }
3976
3977 if (!context->same_pgrp &&
3978 setsid() < 0) {
3979 *exit_status = EXIT_SETSID;
3980 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3981 }
3982
3983 exec_context_tty_reset(context, params);
3984
3985 if (unit_shall_confirm_spawn(unit)) {
3986 const char *vc = params->confirm_spawn;
3987 _cleanup_free_ char *cmdline = NULL;
3988
3989 cmdline = quote_command_line(command->argv);
3990 if (!cmdline) {
3991 *exit_status = EXIT_MEMORY;
3992 return log_oom();
3993 }
3994
3995 r = ask_for_confirmation(vc, unit, cmdline);
3996 if (r != CONFIRM_EXECUTE) {
3997 if (r == CONFIRM_PRETEND_SUCCESS) {
3998 *exit_status = EXIT_SUCCESS;
3999 return 0;
4000 }
4001 *exit_status = EXIT_CONFIRM;
4002 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4003 "Execution cancelled by the user");
4004 }
4005 }
4006
4007 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4008 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4009 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4010 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4011 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4012 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4013 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
4014 *exit_status = EXIT_MEMORY;
4015 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4016 }
4017
4018 if (context->dynamic_user && dcreds) {
4019 _cleanup_strv_free_ char **suggested_paths = NULL;
4020
4021 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4022 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4023 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4024 *exit_status = EXIT_USER;
4025 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4026 }
4027
4028 r = compile_suggested_paths(context, params, &suggested_paths);
4029 if (r < 0) {
4030 *exit_status = EXIT_MEMORY;
4031 return log_oom();
4032 }
4033
4034 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
4035 if (r < 0) {
4036 *exit_status = EXIT_USER;
4037 if (r == -EILSEQ)
4038 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4039 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4040 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4041 }
4042
4043 if (!uid_is_valid(uid)) {
4044 *exit_status = EXIT_USER;
4045 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4046 }
4047
4048 if (!gid_is_valid(gid)) {
4049 *exit_status = EXIT_USER;
4050 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4051 }
4052
4053 if (dcreds->user)
4054 username = dcreds->user->name;
4055
4056 } else {
4057 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4058 if (r < 0) {
4059 *exit_status = EXIT_USER;
4060 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
4061 }
4062
4063 r = get_fixed_group(context, &groupname, &gid);
4064 if (r < 0) {
4065 *exit_status = EXIT_GROUP;
4066 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4067 }
4068 }
4069
4070 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4071 r = get_supplementary_groups(context, username, groupname, gid,
4072 &supplementary_gids, &ngids);
4073 if (r < 0) {
4074 *exit_status = EXIT_GROUP;
4075 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
4076 }
4077
4078 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4079 if (r < 0) {
4080 *exit_status = EXIT_USER;
4081 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
4082 }
4083
4084 user_lookup_fd = safe_close(user_lookup_fd);
4085
4086 r = acquire_home(context, uid, &home, &home_buffer);
4087 if (r < 0) {
4088 *exit_status = EXIT_CHDIR;
4089 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
4090 }
4091
4092 /* If a socket is connected to STDIN/STDOUT/STDERR, we
4093 * must sure to drop O_NONBLOCK */
4094 if (socket_fd >= 0)
4095 (void) fd_nonblock(socket_fd, false);
4096
4097 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4098 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4099 if (params->cgroup_path) {
4100 _cleanup_free_ char *p = NULL;
4101
4102 r = exec_parameters_get_cgroup_path(params, &p);
4103 if (r < 0) {
4104 *exit_status = EXIT_CGROUP;
4105 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4106 }
4107
4108 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4109 if (r < 0) {
4110 *exit_status = EXIT_CGROUP;
4111 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4112 }
4113 }
4114
4115 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
4116 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4117 if (r < 0) {
4118 *exit_status = EXIT_NETWORK;
4119 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4120 }
4121 }
4122
4123 if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4124 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4125 if (r < 0) {
4126 *exit_status = EXIT_NAMESPACE;
4127 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4128 }
4129 }
4130
4131 r = setup_input(context, params, socket_fd, named_iofds);
4132 if (r < 0) {
4133 *exit_status = EXIT_STDIN;
4134 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
4135 }
4136
4137 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4138 if (r < 0) {
4139 *exit_status = EXIT_STDOUT;
4140 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
4141 }
4142
4143 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4144 if (r < 0) {
4145 *exit_status = EXIT_STDERR;
4146 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
4147 }
4148
4149 if (context->oom_score_adjust_set) {
4150 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4151 * prohibit write access to this file, and we shouldn't trip up over that. */
4152 r = set_oom_score_adjust(context->oom_score_adjust);
4153 if (ERRNO_IS_PRIVILEGE(r))
4154 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4155 else if (r < 0) {
4156 *exit_status = EXIT_OOM_ADJUST;
4157 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
4158 }
4159 }
4160
4161 if (context->coredump_filter_set) {
4162 r = set_coredump_filter(context->coredump_filter);
4163 if (ERRNO_IS_PRIVILEGE(r))
4164 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4165 else if (r < 0)
4166 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4167 }
4168
4169 if (context->nice_set) {
4170 r = setpriority_closest(context->nice);
4171 if (r < 0)
4172 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4173 }
4174
4175 if (context->cpu_sched_set) {
4176 struct sched_param param = {
4177 .sched_priority = context->cpu_sched_priority,
4178 };
4179
4180 r = sched_setscheduler(0,
4181 context->cpu_sched_policy |
4182 (context->cpu_sched_reset_on_fork ?
4183 SCHED_RESET_ON_FORK : 0),
4184 &param);
4185 if (r < 0) {
4186 *exit_status = EXIT_SETSCHEDULER;
4187 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4188 }
4189 }
4190
4191 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4192 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4193 const CPUSet *cpu_set;
4194
4195 if (context->cpu_affinity_from_numa) {
4196 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4197 if (r < 0) {
4198 *exit_status = EXIT_CPUAFFINITY;
4199 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4200 }
4201
4202 cpu_set = &converted_cpu_set;
4203 } else
4204 cpu_set = &context->cpu_set;
4205
4206 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4207 *exit_status = EXIT_CPUAFFINITY;
4208 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4209 }
4210 }
4211
4212 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4213 r = apply_numa_policy(&context->numa_policy);
4214 if (r == -EOPNOTSUPP)
4215 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4216 else if (r < 0) {
4217 *exit_status = EXIT_NUMA_POLICY;
4218 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4219 }
4220 }
4221
4222 if (context->ioprio_set)
4223 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4224 *exit_status = EXIT_IOPRIO;
4225 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4226 }
4227
4228 if (context->timer_slack_nsec != NSEC_INFINITY)
4229 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4230 *exit_status = EXIT_TIMERSLACK;
4231 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4232 }
4233
4234 if (context->personality != PERSONALITY_INVALID) {
4235 r = safe_personality(context->personality);
4236 if (r < 0) {
4237 *exit_status = EXIT_PERSONALITY;
4238 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4239 }
4240 }
4241
4242 if (context->utmp_id) {
4243 const char *line = context->tty_path ?
4244 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4245 NULL;
4246 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4247 line,
4248 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4249 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4250 USER_PROCESS,
4251 username);
4252 }
4253
4254 if (uid_is_valid(uid)) {
4255 r = chown_terminal(STDIN_FILENO, uid);
4256 if (r < 0) {
4257 *exit_status = EXIT_STDIN;
4258 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4259 }
4260 }
4261
4262 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4263 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4264 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4265 * touch a single hierarchy too. */
4266 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
4267 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4268 if (r < 0) {
4269 *exit_status = EXIT_CGROUP;
4270 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4271 }
4272 }
4273
4274 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4275
4276 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4277 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4278 if (r < 0)
4279 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4280 }
4281
4282 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4283 r = setup_credentials(context, params, unit->id, uid);
4284 if (r < 0) {
4285 *exit_status = EXIT_CREDENTIALS;
4286 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4287 }
4288 }
4289
4290 r = build_environment(
4291 unit,
4292 context,
4293 params,
4294 n_fds,
4295 home,
4296 username,
4297 shell,
4298 journal_stream_dev,
4299 journal_stream_ino,
4300 &our_env);
4301 if (r < 0) {
4302 *exit_status = EXIT_MEMORY;
4303 return log_oom();
4304 }
4305
4306 r = build_pass_environment(context, &pass_env);
4307 if (r < 0) {
4308 *exit_status = EXIT_MEMORY;
4309 return log_oom();
4310 }
4311
4312 /* The PATH variable is set to the default path in params->environment.
4313 * However, this is overridden if user specified fields have PATH set.
4314 * The intention is to also override PATH if the user does
4315 * not specify PATH and the user has specified ExecSearchPath
4316 */
4317
4318 if (!strv_isempty(context->exec_search_path)) {
4319 _cleanup_free_ char *joined = NULL;
4320
4321 joined = strv_join(context->exec_search_path, ":");
4322 if (!joined) {
4323 *exit_status = EXIT_MEMORY;
4324 return log_oom();
4325 }
4326
4327 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4328 if (r < 0) {
4329 *exit_status = EXIT_MEMORY;
4330 return log_oom();
4331 }
4332 }
4333
4334 accum_env = strv_env_merge(params->environment,
4335 our_env,
4336 joined_exec_search_path,
4337 pass_env,
4338 context->environment,
4339 files_env);
4340 if (!accum_env) {
4341 *exit_status = EXIT_MEMORY;
4342 return log_oom();
4343 }
4344 accum_env = strv_env_clean(accum_env);
4345
4346 (void) umask(context->umask);
4347
4348 r = setup_keyring(unit, context, params, uid, gid);
4349 if (r < 0) {
4350 *exit_status = EXIT_KEYRING;
4351 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4352 }
4353
4354 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
4355 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4356
4357 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
4358 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4359
4360 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
4361 if (needs_ambient_hack)
4362 needs_setuid = false;
4363 else
4364 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4365
4366 if (needs_sandboxing) {
4367 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
4368 * present. The actual MAC context application will happen later, as late as possible, to avoid
4369 * impacting our own code paths. */
4370
4371 #if HAVE_SELINUX
4372 use_selinux = mac_selinux_use();
4373 #endif
4374 #if ENABLE_SMACK
4375 use_smack = mac_smack_use();
4376 #endif
4377 #if HAVE_APPARMOR
4378 use_apparmor = mac_apparmor_use();
4379 #endif
4380 }
4381
4382 if (needs_sandboxing) {
4383 int which_failed;
4384
4385 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4386 * is set here. (See below.) */
4387
4388 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4389 if (r < 0) {
4390 *exit_status = EXIT_LIMITS;
4391 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4392 }
4393 }
4394
4395 if (needs_setuid && context->pam_name && username) {
4396 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4397 * wins here. (See above.) */
4398
4399 /* All fds passed in the fds array will be closed in the pam child process. */
4400 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4401 if (r < 0) {
4402 *exit_status = EXIT_PAM;
4403 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4404 }
4405
4406 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4407 if (ngids_after_pam < 0) {
4408 *exit_status = EXIT_MEMORY;
4409 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4410 }
4411 }
4412
4413 if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
4414 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4415 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4416 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4417
4418 userns_set_up = true;
4419 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4420 if (r < 0) {
4421 *exit_status = EXIT_USER;
4422 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4423 }
4424 }
4425
4426 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4427
4428 if (ns_type_supported(NAMESPACE_NET)) {
4429 r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
4430 if (r == -EPERM)
4431 log_unit_warning_errno(unit, r,
4432 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4433 else if (r < 0) {
4434 *exit_status = EXIT_NETWORK;
4435 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4436 }
4437 } else if (context->network_namespace_path) {
4438 *exit_status = EXIT_NETWORK;
4439 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4440 "NetworkNamespacePath= is not supported, refusing.");
4441 } else
4442 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
4443 }
4444
4445 if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4446
4447 if (ns_type_supported(NAMESPACE_IPC)) {
4448 r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4449 if (r == -EPERM)
4450 log_unit_warning_errno(unit, r,
4451 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4452 else if (r < 0) {
4453 *exit_status = EXIT_NAMESPACE;
4454 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4455 }
4456 } else if (context->ipc_namespace_path) {
4457 *exit_status = EXIT_NAMESPACE;
4458 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4459 "IPCNamespacePath= is not supported, refusing.");
4460 } else
4461 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4462 }
4463
4464 if (needs_mount_namespace) {
4465 _cleanup_free_ char *error_path = NULL;
4466
4467 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
4468 if (r < 0) {
4469 *exit_status = EXIT_NAMESPACE;
4470 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4471 error_path ? ": " : "", strempty(error_path));
4472 }
4473 }
4474
4475 if (needs_sandboxing) {
4476 r = apply_protect_hostname(unit, context, exit_status);
4477 if (r < 0)
4478 return r;
4479 }
4480
4481 /* Drop groups as early as possible.
4482 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4483 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4484 if (needs_setuid) {
4485 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4486 int ngids_to_enforce = 0;
4487
4488 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4489 ngids,
4490 gids_after_pam,
4491 ngids_after_pam,
4492 &gids_to_enforce);
4493 if (ngids_to_enforce < 0) {
4494 *exit_status = EXIT_MEMORY;
4495 return log_unit_error_errno(unit,
4496 ngids_to_enforce,
4497 "Failed to merge group lists. Group membership might be incorrect: %m");
4498 }
4499
4500 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4501 if (r < 0) {
4502 *exit_status = EXIT_GROUP;
4503 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4504 }
4505 }
4506
4507 /* If the user namespace was not set up above, try to do it now.
4508 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4509 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4510 * case of mount namespaces being less privileged when the mount point list is copied from a
4511 * different user namespace). */
4512
4513 if (needs_sandboxing && context->private_users && !userns_set_up) {
4514 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4515 if (r < 0) {
4516 *exit_status = EXIT_USER;
4517 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4518 }
4519 }
4520
4521 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4522 * shall execute. */
4523
4524 _cleanup_free_ char *executable = NULL;
4525 _cleanup_close_ int executable_fd = -1;
4526 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4527 if (r < 0) {
4528 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4529 log_unit_struct_errno(unit, LOG_INFO, r,
4530 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4531 LOG_UNIT_INVOCATION_ID(unit),
4532 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4533 command->path),
4534 "EXECUTABLE=%s", command->path);
4535 return 0;
4536 }
4537
4538 *exit_status = EXIT_EXEC;
4539
4540 return log_unit_struct_errno(unit, LOG_INFO, r,
4541 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4542 LOG_UNIT_INVOCATION_ID(unit),
4543 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4544 command->path),
4545 "EXECUTABLE=%s", command->path);
4546 }
4547
4548 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4549 if (r < 0) {
4550 *exit_status = EXIT_FDS;
4551 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4552 }
4553
4554 #if HAVE_SELINUX
4555 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4556 int fd = -1;
4557
4558 if (socket_fd >= 0)
4559 fd = socket_fd;
4560 else if (params->n_socket_fds == 1)
4561 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4562 * use context from that fd to compute the label. */
4563 fd = params->fds[0];
4564
4565 if (fd >= 0) {
4566 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4567 if (r < 0) {
4568 *exit_status = EXIT_SELINUX_CONTEXT;
4569 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4570 }
4571 }
4572 }
4573 #endif
4574
4575 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
4576 * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
4577 * however if we have it as we want to keep it open until the final execve(). */
4578
4579 r = close_all_fds(keep_fds, n_keep_fds);
4580 if (r >= 0)
4581 r = shift_fds(fds, n_fds);
4582 if (r >= 0)
4583 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
4584 if (r < 0) {
4585 *exit_status = EXIT_FDS;
4586 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4587 }
4588
4589 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4590 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4591 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4592 * came this far. */
4593
4594 secure_bits = context->secure_bits;
4595
4596 if (needs_sandboxing) {
4597 uint64_t bset;
4598
4599 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4600 * requested. (Note this is placed after the general resource limit initialization, see
4601 * above, in order to take precedence.) */
4602 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4603 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4604 *exit_status = EXIT_LIMITS;
4605 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4606 }
4607 }
4608
4609 #if ENABLE_SMACK
4610 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4611 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4612 if (use_smack) {
4613 r = setup_smack(context, executable_fd);
4614 if (r < 0) {
4615 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4616 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4617 }
4618 }
4619 #endif
4620
4621 bset = context->capability_bounding_set;
4622 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4623 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4624 * instead of us doing that */
4625 if (needs_ambient_hack)
4626 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4627 (UINT64_C(1) << CAP_SETUID) |
4628 (UINT64_C(1) << CAP_SETGID);
4629
4630 if (!cap_test_all(bset)) {
4631 r = capability_bounding_set_drop(bset, false);
4632 if (r < 0) {
4633 *exit_status = EXIT_CAPABILITIES;
4634 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4635 }
4636 }
4637
4638 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4639 * keep-caps set.
4640 * To be able to raise the ambient capabilities after setresuid() they have to be
4641 * added to the inherited set and keep caps has to be set (done in enforce_user()).
4642 * After setresuid() the ambient capabilities can be raised as they are present in
4643 * the permitted and inheritable set. However it is possible that someone wants to
4644 * set ambient capabilities without changing the user, so we also set the ambient
4645 * capabilities here.
4646 * The requested ambient capabilities are raised in the inheritable set if the
4647 * second argument is true. */
4648 if (!needs_ambient_hack) {
4649 r = capability_ambient_set_apply(context->capability_ambient_set, true);
4650 if (r < 0) {
4651 *exit_status = EXIT_CAPABILITIES;
4652 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4653 }
4654 }
4655 }
4656
4657 /* chroot to root directory first, before we lose the ability to chroot */
4658 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4659 if (r < 0)
4660 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4661
4662 if (needs_setuid) {
4663 if (uid_is_valid(uid)) {
4664 r = enforce_user(context, uid);
4665 if (r < 0) {
4666 *exit_status = EXIT_USER;
4667 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4668 }
4669
4670 if (!needs_ambient_hack &&
4671 context->capability_ambient_set != 0) {
4672
4673 /* Raise the ambient capabilities after user change. */
4674 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4675 if (r < 0) {
4676 *exit_status = EXIT_CAPABILITIES;
4677 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4678 }
4679 }
4680 }
4681 }
4682
4683 /* Apply working directory here, because the working directory might be on NFS and only the user running
4684 * this service might have the correct privilege to change to the working directory */
4685 r = apply_working_directory(context, params, home, exit_status);
4686 if (r < 0)
4687 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4688
4689 if (needs_sandboxing) {
4690 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4691 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4692 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4693 * are restricted. */
4694
4695 #if HAVE_SELINUX
4696 if (use_selinux) {
4697 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4698
4699 if (exec_context) {
4700 r = setexeccon(exec_context);
4701 if (r < 0) {
4702 *exit_status = EXIT_SELINUX_CONTEXT;
4703 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4704 }
4705 }
4706 }
4707 #endif
4708
4709 #if HAVE_APPARMOR
4710 if (use_apparmor && context->apparmor_profile) {
4711 r = aa_change_onexec(context->apparmor_profile);
4712 if (r < 0 && !context->apparmor_profile_ignore) {
4713 *exit_status = EXIT_APPARMOR_PROFILE;
4714 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4715 }
4716 }
4717 #endif
4718
4719 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
4720 * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4721 * CAP_SETPCAP. */
4722 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4723 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4724 * effective set here.
4725 * The effective set is overwritten during execve with the following values:
4726 * - ambient set (for non-root processes)
4727 * - (inheritable | bounding) set for root processes)
4728 *
4729 * Hence there is no security impact to raise it in the effective set before execve
4730 */
4731 r = capability_gain_cap_setpcap(NULL);
4732 if (r < 0) {
4733 *exit_status = EXIT_CAPABILITIES;
4734 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4735 }
4736 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4737 *exit_status = EXIT_SECUREBITS;
4738 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
4739 }
4740 }
4741
4742 if (context_has_no_new_privileges(context))
4743 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4744 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4745 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
4746 }
4747
4748 #if HAVE_SECCOMP
4749 r = apply_address_families(unit, context);
4750 if (r < 0) {
4751 *exit_status = EXIT_ADDRESS_FAMILIES;
4752 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4753 }
4754
4755 r = apply_memory_deny_write_execute(unit, context);
4756 if (r < 0) {
4757 *exit_status = EXIT_SECCOMP;
4758 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
4759 }
4760
4761 r = apply_restrict_realtime(unit, context);
4762 if (r < 0) {
4763 *exit_status = EXIT_SECCOMP;
4764 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
4765 }
4766
4767 r = apply_restrict_suid_sgid(unit, context);
4768 if (r < 0) {
4769 *exit_status = EXIT_SECCOMP;
4770 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4771 }
4772
4773 r = apply_restrict_namespaces(unit, context);
4774 if (r < 0) {
4775 *exit_status = EXIT_SECCOMP;
4776 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
4777 }
4778
4779 r = apply_protect_sysctl(unit, context);
4780 if (r < 0) {
4781 *exit_status = EXIT_SECCOMP;
4782 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
4783 }
4784
4785 r = apply_protect_kernel_modules(unit, context);
4786 if (r < 0) {
4787 *exit_status = EXIT_SECCOMP;
4788 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
4789 }
4790
4791 r = apply_protect_kernel_logs(unit, context);
4792 if (r < 0) {
4793 *exit_status = EXIT_SECCOMP;
4794 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4795 }
4796
4797 r = apply_protect_clock(unit, context);
4798 if (r < 0) {
4799 *exit_status = EXIT_SECCOMP;
4800 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4801 }
4802
4803 r = apply_private_devices(unit, context);
4804 if (r < 0) {
4805 *exit_status = EXIT_SECCOMP;
4806 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
4807 }
4808
4809 r = apply_syscall_archs(unit, context);
4810 if (r < 0) {
4811 *exit_status = EXIT_SECCOMP;
4812 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
4813 }
4814
4815 r = apply_lock_personality(unit, context);
4816 if (r < 0) {
4817 *exit_status = EXIT_SECCOMP;
4818 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
4819 }
4820
4821 r = apply_syscall_log(unit, context);
4822 if (r < 0) {
4823 *exit_status = EXIT_SECCOMP;
4824 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
4825 }
4826
4827 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
4828 * by the filter as little as possible. */
4829 r = apply_syscall_filter(unit, context, needs_ambient_hack);
4830 if (r < 0) {
4831 *exit_status = EXIT_SECCOMP;
4832 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
4833 }
4834 #endif
4835
4836 #if HAVE_LIBBPF
4837 r = apply_restrict_filesystems(unit, context);
4838 if (r < 0) {
4839 *exit_status = EXIT_BPF;
4840 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
4841 }
4842 #endif
4843
4844 }
4845
4846 if (!strv_isempty(context->unset_environment)) {
4847 char **ee = NULL;
4848
4849 ee = strv_env_delete(accum_env, 1, context->unset_environment);
4850 if (!ee) {
4851 *exit_status = EXIT_MEMORY;
4852 return log_oom();
4853 }
4854
4855 strv_free_and_replace(accum_env, ee);
4856 }
4857
4858 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
4859 replaced_argv = replace_env_argv(command->argv, accum_env);
4860 if (!replaced_argv) {
4861 *exit_status = EXIT_MEMORY;
4862 return log_oom();
4863 }
4864 final_argv = replaced_argv;
4865 } else
4866 final_argv = command->argv;
4867
4868 if (DEBUG_LOGGING) {
4869 _cleanup_free_ char *line = NULL;
4870
4871 line = quote_command_line(final_argv);
4872 if (!line) {
4873 *exit_status = EXIT_MEMORY;
4874 return log_oom();
4875 }
4876
4877 log_unit_struct(unit, LOG_DEBUG,
4878 "EXECUTABLE=%s", executable,
4879 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
4880 }
4881
4882 if (exec_fd >= 0) {
4883 uint8_t hot = 1;
4884
4885 /* We have finished with all our initializations. Let's now let the manager know that. From this point
4886 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
4887
4888 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4889 *exit_status = EXIT_EXEC;
4890 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4891 }
4892 }
4893
4894 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
4895
4896 if (exec_fd >= 0) {
4897 uint8_t hot = 0;
4898
4899 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4900 * that POLLHUP on it no longer means execve() succeeded. */
4901
4902 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4903 *exit_status = EXIT_EXEC;
4904 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4905 }
4906 }
4907
4908 *exit_status = EXIT_EXEC;
4909 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
4910 }
4911
4912 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
4913 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
4914
4915 int exec_spawn(Unit *unit,
4916 ExecCommand *command,
4917 const ExecContext *context,
4918 const ExecParameters *params,
4919 ExecRuntime *runtime,
4920 DynamicCreds *dcreds,
4921 pid_t *ret) {
4922
4923 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
4924 _cleanup_free_ char *subcgroup_path = NULL;
4925 _cleanup_strv_free_ char **files_env = NULL;
4926 size_t n_storage_fds = 0, n_socket_fds = 0;
4927 _cleanup_free_ char *line = NULL;
4928 pid_t pid;
4929
4930 assert(unit);
4931 assert(command);
4932 assert(context);
4933 assert(ret);
4934 assert(params);
4935 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4936
4937 if (context->std_input == EXEC_INPUT_SOCKET ||
4938 context->std_output == EXEC_OUTPUT_SOCKET ||
4939 context->std_error == EXEC_OUTPUT_SOCKET) {
4940
4941 if (params->n_socket_fds > 1)
4942 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
4943
4944 if (params->n_socket_fds == 0)
4945 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
4946
4947 socket_fd = params->fds[0];
4948 } else {
4949 socket_fd = -1;
4950 fds = params->fds;
4951 n_socket_fds = params->n_socket_fds;
4952 n_storage_fds = params->n_storage_fds;
4953 }
4954
4955 r = exec_context_named_iofds(context, params, named_iofds);
4956 if (r < 0)
4957 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4958
4959 r = exec_context_load_environment(unit, context, &files_env);
4960 if (r < 0)
4961 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
4962
4963 line = quote_command_line(command->argv);
4964 if (!line)
4965 return log_oom();
4966
4967 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
4968 and, until the next SELinux policy changes, we save further reloads in future children. */
4969 mac_selinux_maybe_reload();
4970
4971 log_unit_struct(unit, LOG_DEBUG,
4972 LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
4973 "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
4974 the mount namespace in the child, but we want to log
4975 from the parent, so we need to use the (possibly
4976 inaccurate) path here. */
4977 LOG_UNIT_INVOCATION_ID(unit));
4978
4979 if (params->cgroup_path) {
4980 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4981 if (r < 0)
4982 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4983 if (r > 0) { /* We are using a child cgroup */
4984 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4985 if (r < 0)
4986 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4987
4988 /* Normally we would not propagate the oomd xattrs to children but since we created this
4989 * sub-cgroup internally we should do it. */
4990 cgroup_oomd_xattr_apply(unit, subcgroup_path);
4991 }
4992 }
4993
4994 pid = fork();
4995 if (pid < 0)
4996 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
4997
4998 if (pid == 0) {
4999 int exit_status = EXIT_SUCCESS;
5000
5001 r = exec_child(unit,
5002 command,
5003 context,
5004 params,
5005 runtime,
5006 dcreds,
5007 socket_fd,
5008 named_iofds,
5009 fds,
5010 n_socket_fds,
5011 n_storage_fds,
5012 files_env,
5013 unit->manager->user_lookup_fds[1],
5014 &exit_status);
5015
5016 if (r < 0) {
5017 const char *status =
5018 exit_status_to_string(exit_status,
5019 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
5020
5021 log_unit_struct_errno(unit, LOG_ERR, r,
5022 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5023 LOG_UNIT_INVOCATION_ID(unit),
5024 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5025 status, command->path),
5026 "EXECUTABLE=%s", command->path);
5027 }
5028
5029 _exit(exit_status);
5030 }
5031
5032 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
5033
5034 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5035 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5036 * process will be killed too). */
5037 if (subcgroup_path)
5038 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
5039
5040 exec_status_start(&command->exec_status, pid);
5041
5042 *ret = pid;
5043 return 0;
5044 }
5045
5046 void exec_context_init(ExecContext *c) {
5047 assert(c);
5048
5049 c->umask = 0022;
5050 c->ioprio = ioprio_prio_value(IOPRIO_CLASS_BE, 0);
5051 c->cpu_sched_policy = SCHED_OTHER;
5052 c->syslog_priority = LOG_DAEMON|LOG_INFO;
5053 c->syslog_level_prefix = true;
5054 c->ignore_sigpipe = true;
5055 c->timer_slack_nsec = NSEC_INFINITY;
5056 c->personality = PERSONALITY_INVALID;
5057 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5058 c->directories[t].mode = 0755;
5059 c->timeout_clean_usec = USEC_INFINITY;
5060 c->capability_bounding_set = CAP_ALL;
5061 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5062 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
5063 c->log_level_max = -1;
5064 #if HAVE_SECCOMP
5065 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5066 #endif
5067 numa_policy_reset(&c->numa_policy);
5068 }
5069
5070 void exec_context_done(ExecContext *c) {
5071 assert(c);
5072
5073 c->environment = strv_free(c->environment);
5074 c->environment_files = strv_free(c->environment_files);
5075 c->pass_environment = strv_free(c->pass_environment);
5076 c->unset_environment = strv_free(c->unset_environment);
5077
5078 rlimit_free_all(c->rlimit);
5079
5080 for (size_t l = 0; l < 3; l++) {
5081 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
5082 c->stdio_file[l] = mfree(c->stdio_file[l]);
5083 }
5084
5085 c->working_directory = mfree(c->working_directory);
5086 c->root_directory = mfree(c->root_directory);
5087 c->root_image = mfree(c->root_image);
5088 c->root_image_options = mount_options_free_all(c->root_image_options);
5089 c->root_hash = mfree(c->root_hash);
5090 c->root_hash_size = 0;
5091 c->root_hash_path = mfree(c->root_hash_path);
5092 c->root_hash_sig = mfree(c->root_hash_sig);
5093 c->root_hash_sig_size = 0;
5094 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
5095 c->root_verity = mfree(c->root_verity);
5096 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
5097 c->tty_path = mfree(c->tty_path);
5098 c->syslog_identifier = mfree(c->syslog_identifier);
5099 c->user = mfree(c->user);
5100 c->group = mfree(c->group);
5101
5102 c->supplementary_groups = strv_free(c->supplementary_groups);
5103
5104 c->pam_name = mfree(c->pam_name);
5105
5106 c->read_only_paths = strv_free(c->read_only_paths);
5107 c->read_write_paths = strv_free(c->read_write_paths);
5108 c->inaccessible_paths = strv_free(c->inaccessible_paths);
5109 c->exec_paths = strv_free(c->exec_paths);
5110 c->no_exec_paths = strv_free(c->no_exec_paths);
5111 c->exec_search_path = strv_free(c->exec_search_path);
5112
5113 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
5114 c->bind_mounts = NULL;
5115 c->n_bind_mounts = 0;
5116 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5117 c->temporary_filesystems = NULL;
5118 c->n_temporary_filesystems = 0;
5119 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
5120
5121 cpu_set_reset(&c->cpu_set);
5122 numa_policy_reset(&c->numa_policy);
5123
5124 c->utmp_id = mfree(c->utmp_id);
5125 c->selinux_context = mfree(c->selinux_context);
5126 c->apparmor_profile = mfree(c->apparmor_profile);
5127 c->smack_process_label = mfree(c->smack_process_label);
5128
5129 c->restrict_filesystems = set_free(c->restrict_filesystems);
5130
5131 c->syscall_filter = hashmap_free(c->syscall_filter);
5132 c->syscall_archs = set_free(c->syscall_archs);
5133 c->address_families = set_free(c->address_families);
5134
5135 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5136 exec_directory_done(&c->directories[t]);
5137
5138 c->log_level_max = -1;
5139
5140 exec_context_free_log_extra_fields(c);
5141
5142 c->log_ratelimit_interval_usec = 0;
5143 c->log_ratelimit_burst = 0;
5144
5145 c->stdin_data = mfree(c->stdin_data);
5146 c->stdin_data_size = 0;
5147
5148 c->network_namespace_path = mfree(c->network_namespace_path);
5149 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
5150
5151 c->log_namespace = mfree(c->log_namespace);
5152
5153 c->load_credentials = hashmap_free(c->load_credentials);
5154 c->set_credentials = hashmap_free(c->set_credentials);
5155 }
5156
5157 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
5158 assert(c);
5159
5160 if (!runtime_prefix)
5161 return 0;
5162
5163 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
5164 _cleanup_free_ char *p = NULL;
5165
5166 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5167 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5168 else
5169 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5170 if (!p)
5171 return -ENOMEM;
5172
5173 /* We execute this synchronously, since we need to be sure this is gone when we start the
5174 * service next. */
5175 (void) rm_rf(p, REMOVE_ROOT);
5176
5177 char **symlink;
5178 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5179 _cleanup_free_ char *symlink_abs = NULL;
5180
5181 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5182 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5183 else
5184 symlink_abs = path_join(runtime_prefix, *symlink);
5185 if (!symlink_abs)
5186 return -ENOMEM;
5187
5188 (void) unlink(symlink_abs);
5189 }
5190
5191 }
5192
5193 return 0;
5194 }
5195
5196 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5197 _cleanup_free_ char *p = NULL;
5198
5199 assert(c);
5200
5201 if (!runtime_prefix || !unit)
5202 return 0;
5203
5204 p = path_join(runtime_prefix, "credentials", unit);
5205 if (!p)
5206 return -ENOMEM;
5207
5208 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5209 * unmount it, and afterwards remove the mount point */
5210 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5211 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5212
5213 return 0;
5214 }
5215
5216 static void exec_command_done(ExecCommand *c) {
5217 assert(c);
5218
5219 c->path = mfree(c->path);
5220 c->argv = strv_free(c->argv);
5221 }
5222
5223 void exec_command_done_array(ExecCommand *c, size_t n) {
5224 for (size_t i = 0; i < n; i++)
5225 exec_command_done(c+i);
5226 }
5227
5228 ExecCommand* exec_command_free_list(ExecCommand *c) {
5229 ExecCommand *i;
5230
5231 while ((i = c)) {
5232 LIST_REMOVE(command, c, i);
5233 exec_command_done(i);
5234 free(i);
5235 }
5236
5237 return NULL;
5238 }
5239
5240 void exec_command_free_array(ExecCommand **c, size_t n) {
5241 for (size_t i = 0; i < n; i++)
5242 c[i] = exec_command_free_list(c[i]);
5243 }
5244
5245 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5246 for (size_t i = 0; i < n; i++)
5247 exec_status_reset(&c[i].exec_status);
5248 }
5249
5250 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5251 for (size_t i = 0; i < n; i++) {
5252 ExecCommand *z;
5253
5254 LIST_FOREACH(command, z, c[i])
5255 exec_status_reset(&z->exec_status);
5256 }
5257 }
5258
5259 typedef struct InvalidEnvInfo {
5260 const Unit *unit;
5261 const char *path;
5262 } InvalidEnvInfo;
5263
5264 static void invalid_env(const char *p, void *userdata) {
5265 InvalidEnvInfo *info = userdata;
5266
5267 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5268 }
5269
5270 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5271 assert(c);
5272
5273 switch (fd_index) {
5274
5275 case STDIN_FILENO:
5276 if (c->std_input != EXEC_INPUT_NAMED_FD)
5277 return NULL;
5278
5279 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5280
5281 case STDOUT_FILENO:
5282 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5283 return NULL;
5284
5285 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5286
5287 case STDERR_FILENO:
5288 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5289 return NULL;
5290
5291 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5292
5293 default:
5294 return NULL;
5295 }
5296 }
5297
5298 static int exec_context_named_iofds(
5299 const ExecContext *c,
5300 const ExecParameters *p,
5301 int named_iofds[static 3]) {
5302
5303 size_t targets;
5304 const char* stdio_fdname[3];
5305 size_t n_fds;
5306
5307 assert(c);
5308 assert(p);
5309 assert(named_iofds);
5310
5311 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5312 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5313 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5314
5315 for (size_t i = 0; i < 3; i++)
5316 stdio_fdname[i] = exec_context_fdname(c, i);
5317
5318 n_fds = p->n_storage_fds + p->n_socket_fds;
5319
5320 for (size_t i = 0; i < n_fds && targets > 0; i++)
5321 if (named_iofds[STDIN_FILENO] < 0 &&
5322 c->std_input == EXEC_INPUT_NAMED_FD &&
5323 stdio_fdname[STDIN_FILENO] &&
5324 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5325
5326 named_iofds[STDIN_FILENO] = p->fds[i];
5327 targets--;
5328
5329 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5330 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5331 stdio_fdname[STDOUT_FILENO] &&
5332 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5333
5334 named_iofds[STDOUT_FILENO] = p->fds[i];
5335 targets--;
5336
5337 } else if (named_iofds[STDERR_FILENO] < 0 &&
5338 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5339 stdio_fdname[STDERR_FILENO] &&
5340 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5341
5342 named_iofds[STDERR_FILENO] = p->fds[i];
5343 targets--;
5344 }
5345
5346 return targets == 0 ? 0 : -ENOENT;
5347 }
5348
5349 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
5350 char **i, **r = NULL;
5351
5352 assert(c);
5353 assert(l);
5354
5355 STRV_FOREACH(i, c->environment_files) {
5356 char *fn;
5357 int k;
5358 bool ignore = false;
5359 char **p;
5360 _cleanup_globfree_ glob_t pglob = {};
5361
5362 fn = *i;
5363
5364 if (fn[0] == '-') {
5365 ignore = true;
5366 fn++;
5367 }
5368
5369 if (!path_is_absolute(fn)) {
5370 if (ignore)
5371 continue;
5372
5373 strv_free(r);
5374 return -EINVAL;
5375 }
5376
5377 /* Filename supports globbing, take all matching files */
5378 k = safe_glob(fn, 0, &pglob);
5379 if (k < 0) {
5380 if (ignore)
5381 continue;
5382
5383 strv_free(r);
5384 return k;
5385 }
5386
5387 /* When we don't match anything, -ENOENT should be returned */
5388 assert(pglob.gl_pathc > 0);
5389
5390 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
5391 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
5392 if (k < 0) {
5393 if (ignore)
5394 continue;
5395
5396 strv_free(r);
5397 return k;
5398 }
5399 /* Log invalid environment variables with filename */
5400 if (p) {
5401 InvalidEnvInfo info = {
5402 .unit = unit,
5403 .path = pglob.gl_pathv[n]
5404 };
5405
5406 p = strv_env_clean_with_callback(p, invalid_env, &info);
5407 }
5408
5409 if (!r)
5410 r = p;
5411 else {
5412 char **m;
5413
5414 m = strv_env_merge(r, p);
5415 strv_free(r);
5416 strv_free(p);
5417 if (!m)
5418 return -ENOMEM;
5419
5420 r = m;
5421 }
5422 }
5423 }
5424
5425 *l = r;
5426
5427 return 0;
5428 }
5429
5430 static bool tty_may_match_dev_console(const char *tty) {
5431 _cleanup_free_ char *resolved = NULL;
5432
5433 if (!tty)
5434 return true;
5435
5436 tty = skip_dev_prefix(tty);
5437
5438 /* trivial identity? */
5439 if (streq(tty, "console"))
5440 return true;
5441
5442 if (resolve_dev_console(&resolved) < 0)
5443 return true; /* if we could not resolve, assume it may */
5444
5445 /* "tty0" means the active VC, so it may be the same sometimes */
5446 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5447 }
5448
5449 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5450 assert(ec);
5451
5452 return ec->tty_reset ||
5453 ec->tty_vhangup ||
5454 ec->tty_vt_disallocate ||
5455 is_terminal_input(ec->std_input) ||
5456 is_terminal_output(ec->std_output) ||
5457 is_terminal_output(ec->std_error);
5458 }
5459
5460 bool exec_context_may_touch_console(const ExecContext *ec) {
5461
5462 return exec_context_may_touch_tty(ec) &&
5463 tty_may_match_dev_console(exec_context_tty_path(ec));
5464 }
5465
5466 static void strv_fprintf(FILE *f, char **l) {
5467 char **g;
5468
5469 assert(f);
5470
5471 STRV_FOREACH(g, l)
5472 fprintf(f, " %s", *g);
5473 }
5474
/* Writes one "<prefix><name>: item item …" line for the string vector to f. Nothing at all is written
 * if the vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5486
5487 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5488 char **e, **d;
5489 int r;
5490
5491 assert(c);
5492 assert(f);
5493
5494 prefix = strempty(prefix);
5495
5496 fprintf(f,
5497 "%sUMask: %04o\n"
5498 "%sWorkingDirectory: %s\n"
5499 "%sRootDirectory: %s\n"
5500 "%sNonBlocking: %s\n"
5501 "%sPrivateTmp: %s\n"
5502 "%sPrivateDevices: %s\n"
5503 "%sProtectKernelTunables: %s\n"
5504 "%sProtectKernelModules: %s\n"
5505 "%sProtectKernelLogs: %s\n"
5506 "%sProtectClock: %s\n"
5507 "%sProtectControlGroups: %s\n"
5508 "%sPrivateNetwork: %s\n"
5509 "%sPrivateUsers: %s\n"
5510 "%sProtectHome: %s\n"
5511 "%sProtectSystem: %s\n"
5512 "%sMountAPIVFS: %s\n"
5513 "%sIgnoreSIGPIPE: %s\n"
5514 "%sMemoryDenyWriteExecute: %s\n"
5515 "%sRestrictRealtime: %s\n"
5516 "%sRestrictSUIDSGID: %s\n"
5517 "%sKeyringMode: %s\n"
5518 "%sProtectHostname: %s\n"
5519 "%sProtectProc: %s\n"
5520 "%sProcSubset: %s\n",
5521 prefix, c->umask,
5522 prefix, empty_to_root(c->working_directory),
5523 prefix, empty_to_root(c->root_directory),
5524 prefix, yes_no(c->non_blocking),
5525 prefix, yes_no(c->private_tmp),
5526 prefix, yes_no(c->private_devices),
5527 prefix, yes_no(c->protect_kernel_tunables),
5528 prefix, yes_no(c->protect_kernel_modules),
5529 prefix, yes_no(c->protect_kernel_logs),
5530 prefix, yes_no(c->protect_clock),
5531 prefix, yes_no(c->protect_control_groups),
5532 prefix, yes_no(c->private_network),
5533 prefix, yes_no(c->private_users),
5534 prefix, protect_home_to_string(c->protect_home),
5535 prefix, protect_system_to_string(c->protect_system),
5536 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5537 prefix, yes_no(c->ignore_sigpipe),
5538 prefix, yes_no(c->memory_deny_write_execute),
5539 prefix, yes_no(c->restrict_realtime),
5540 prefix, yes_no(c->restrict_suid_sgid),
5541 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5542 prefix, yes_no(c->protect_hostname),
5543 prefix, protect_proc_to_string(c->protect_proc),
5544 prefix, proc_subset_to_string(c->proc_subset));
5545
5546 if (c->root_image)
5547 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5548
5549 if (c->root_image_options) {
5550 MountOptions *o;
5551
5552 fprintf(f, "%sRootImageOptions:", prefix);
5553 LIST_FOREACH(mount_options, o, c->root_image_options)
5554 if (!isempty(o->options))
5555 fprintf(f, " %s:%s",
5556 partition_designator_to_string(o->partition_designator),
5557 o->options);
5558 fprintf(f, "\n");
5559 }
5560
5561 if (c->root_hash) {
5562 _cleanup_free_ char *encoded = NULL;
5563 encoded = hexmem(c->root_hash, c->root_hash_size);
5564 if (encoded)
5565 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5566 }
5567
5568 if (c->root_hash_path)
5569 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5570
5571 if (c->root_hash_sig) {
5572 _cleanup_free_ char *encoded = NULL;
5573 ssize_t len;
5574 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5575 if (len)
5576 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5577 }
5578
5579 if (c->root_hash_sig_path)
5580 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5581
5582 if (c->root_verity)
5583 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5584
5585 STRV_FOREACH(e, c->environment)
5586 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5587
5588 STRV_FOREACH(e, c->environment_files)
5589 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5590
5591 STRV_FOREACH(e, c->pass_environment)
5592 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5593
5594 STRV_FOREACH(e, c->unset_environment)
5595 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5596
5597 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5598
5599 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5600 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5601
5602 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5603 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5604
5605 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5606 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5607 }
5608 }
5609
5610 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
5611
5612 if (c->nice_set)
5613 fprintf(f, "%sNice: %i\n", prefix, c->nice);
5614
5615 if (c->oom_score_adjust_set)
5616 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
5617
5618 if (c->coredump_filter_set)
5619 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
5620
5621 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5622 if (c->rlimit[i]) {
5623 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5624 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5625 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5626 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5627 }
5628
5629 if (c->ioprio_set) {
5630 _cleanup_free_ char *class_str = NULL;
5631
5632 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
5633 if (r >= 0)
5634 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5635
5636 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
5637 }
5638
5639 if (c->cpu_sched_set) {
5640 _cleanup_free_ char *policy_str = NULL;
5641
5642 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5643 if (r >= 0)
5644 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5645
5646 fprintf(f,
5647 "%sCPUSchedulingPriority: %i\n"
5648 "%sCPUSchedulingResetOnFork: %s\n",
5649 prefix, c->cpu_sched_priority,
5650 prefix, yes_no(c->cpu_sched_reset_on_fork));
5651 }
5652
5653 if (c->cpu_set.set) {
5654 _cleanup_free_ char *affinity = NULL;
5655
5656 affinity = cpu_set_to_range_string(&c->cpu_set);
5657 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5658 }
5659
5660 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5661 _cleanup_free_ char *nodes = NULL;
5662
5663 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5664 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5665 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5666 }
5667
5668 if (c->timer_slack_nsec != NSEC_INFINITY)
5669 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5670
5671 fprintf(f,
5672 "%sStandardInput: %s\n"
5673 "%sStandardOutput: %s\n"
5674 "%sStandardError: %s\n",
5675 prefix, exec_input_to_string(c->std_input),
5676 prefix, exec_output_to_string(c->std_output),
5677 prefix, exec_output_to_string(c->std_error));
5678
5679 if (c->std_input == EXEC_INPUT_NAMED_FD)
5680 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5681 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5682 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5683 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5684 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5685
5686 if (c->std_input == EXEC_INPUT_FILE)
5687 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5688 if (c->std_output == EXEC_OUTPUT_FILE)
5689 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5690 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5691 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5692 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5693 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5694 if (c->std_error == EXEC_OUTPUT_FILE)
5695 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5696 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5697 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5698 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5699 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5700
5701 if (c->tty_path)
5702 fprintf(f,
5703 "%sTTYPath: %s\n"
5704 "%sTTYReset: %s\n"
5705 "%sTTYVHangup: %s\n"
5706 "%sTTYVTDisallocate: %s\n",
5707 prefix, c->tty_path,
5708 prefix, yes_no(c->tty_reset),
5709 prefix, yes_no(c->tty_vhangup),
5710 prefix, yes_no(c->tty_vt_disallocate));
5711
5712 if (IN_SET(c->std_output,
5713 EXEC_OUTPUT_KMSG,
5714 EXEC_OUTPUT_JOURNAL,
5715 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5716 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5717 IN_SET(c->std_error,
5718 EXEC_OUTPUT_KMSG,
5719 EXEC_OUTPUT_JOURNAL,
5720 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5721 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5722
5723 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5724
5725 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5726 if (r >= 0)
5727 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5728
5729 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5730 if (r >= 0)
5731 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5732 }
5733
5734 if (c->log_level_max >= 0) {
5735 _cleanup_free_ char *t = NULL;
5736
5737 (void) log_level_to_string_alloc(c->log_level_max, &t);
5738
5739 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5740 }
5741
5742 if (c->log_ratelimit_interval_usec > 0)
5743 fprintf(f,
5744 "%sLogRateLimitIntervalSec: %s\n",
5745 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
5746
5747 if (c->log_ratelimit_burst > 0)
5748 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
5749
5750 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5751 fprintf(f, "%sLogExtraFields: ", prefix);
5752 fwrite(c->log_extra_fields[j].iov_base,
5753 1, c->log_extra_fields[j].iov_len,
5754 f);
5755 fputc('\n', f);
5756 }
5757
5758 if (c->log_namespace)
5759 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5760
5761 if (c->secure_bits) {
5762 _cleanup_free_ char *str = NULL;
5763
5764 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5765 if (r >= 0)
5766 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5767 }
5768
5769 if (c->capability_bounding_set != CAP_ALL) {
5770 _cleanup_free_ char *str = NULL;
5771
5772 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
5773 if (r >= 0)
5774 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
5775 }
5776
5777 if (c->capability_ambient_set != 0) {
5778 _cleanup_free_ char *str = NULL;
5779
5780 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
5781 if (r >= 0)
5782 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
5783 }
5784
5785 if (c->user)
5786 fprintf(f, "%sUser: %s\n", prefix, c->user);
5787 if (c->group)
5788 fprintf(f, "%sGroup: %s\n", prefix, c->group);
5789
5790 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5791
5792 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
5793
5794 if (c->pam_name)
5795 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5796
5797 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
5798 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
5799 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
5800 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
5801 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
5802 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
5803
5804 for (size_t i = 0; i < c->n_bind_mounts; i++)
5805 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
5806 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
5807 c->bind_mounts[i].ignore_enoent ? "-": "",
5808 c->bind_mounts[i].source,
5809 c->bind_mounts[i].destination,
5810 c->bind_mounts[i].recursive ? "rbind" : "norbind");
5811
5812 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
5813 const TemporaryFileSystem *t = c->temporary_filesystems + i;
5814
5815 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
5816 t->path,
5817 isempty(t->options) ? "" : ":",
5818 strempty(t->options));
5819 }
5820
5821 if (c->utmp_id)
5822 fprintf(f,
5823 "%sUtmpIdentifier: %s\n",
5824 prefix, c->utmp_id);
5825
5826 if (c->selinux_context)
5827 fprintf(f,
5828 "%sSELinuxContext: %s%s\n",
5829 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
5830
5831 if (c->apparmor_profile)
5832 fprintf(f,
5833 "%sAppArmorProfile: %s%s\n",
5834 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
5835
5836 if (c->smack_process_label)
5837 fprintf(f,
5838 "%sSmackProcessLabel: %s%s\n",
5839 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
5840
5841 if (c->personality != PERSONALITY_INVALID)
5842 fprintf(f,
5843 "%sPersonality: %s\n",
5844 prefix, strna(personality_to_string(c->personality)));
5845
5846 fprintf(f,
5847 "%sLockPersonality: %s\n",
5848 prefix, yes_no(c->lock_personality));
5849
5850 if (c->syscall_filter) {
5851 #if HAVE_SECCOMP
5852 void *id, *val;
5853 bool first = true;
5854 #endif
5855
5856 fprintf(f,
5857 "%sSystemCallFilter: ",
5858 prefix);
5859
5860 if (!c->syscall_allow_list)
5861 fputc('~', f);
5862
5863 #if HAVE_SECCOMP
5864 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
5865 _cleanup_free_ char *name = NULL;
5866 const char *errno_name = NULL;
5867 int num = PTR_TO_INT(val);
5868
5869 if (first)
5870 first = false;
5871 else
5872 fputc(' ', f);
5873
5874 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
5875 fputs(strna(name), f);
5876
5877 if (num >= 0) {
5878 errno_name = seccomp_errno_or_action_to_string(num);
5879 if (errno_name)
5880 fprintf(f, ":%s", errno_name);
5881 else
5882 fprintf(f, ":%d", num);
5883 }
5884 }
5885 #endif
5886
5887 fputc('\n', f);
5888 }
5889
5890 if (c->syscall_archs) {
5891 #if HAVE_SECCOMP
5892 void *id;
5893 #endif
5894
5895 fprintf(f,
5896 "%sSystemCallArchitectures:",
5897 prefix);
5898
5899 #if HAVE_SECCOMP
5900 SET_FOREACH(id, c->syscall_archs)
5901 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
5902 #endif
5903 fputc('\n', f);
5904 }
5905
5906 if (exec_context_restrict_namespaces_set(c)) {
5907 _cleanup_free_ char *s = NULL;
5908
5909 r = namespace_flags_to_string(c->restrict_namespaces, &s);
5910 if (r >= 0)
5911 fprintf(f, "%sRestrictNamespaces: %s\n",
5912 prefix, strna(s));
5913 }
5914
5915 #if HAVE_LIBBPF
5916 if (exec_context_restrict_filesystems_set(c))
5917 SET_FOREACH(e, c->restrict_filesystems)
5918 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, *e);
5919 #endif
5920
5921 if (c->network_namespace_path)
5922 fprintf(f,
5923 "%sNetworkNamespacePath: %s\n",
5924 prefix, c->network_namespace_path);
5925
5926 if (c->syscall_errno > 0) {
5927 #if HAVE_SECCOMP
5928 const char *errno_name;
5929 #endif
5930
5931 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
5932
5933 #if HAVE_SECCOMP
5934 errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
5935 if (errno_name)
5936 fputs(errno_name, f);
5937 else
5938 fprintf(f, "%d", c->syscall_errno);
5939 #endif
5940 fputc('\n', f);
5941 }
5942
5943 for (size_t i = 0; i < c->n_mount_images; i++) {
5944 MountOptions *o;
5945
5946 fprintf(f, "%sMountImages: %s%s:%s", prefix,
5947 c->mount_images[i].ignore_enoent ? "-": "",
5948 c->mount_images[i].source,
5949 c->mount_images[i].destination);
5950 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
5951 fprintf(f, ":%s:%s",
5952 partition_designator_to_string(o->partition_designator),
5953 strempty(o->options));
5954 fprintf(f, "\n");
5955 }
5956
5957 for (size_t i = 0; i < c->n_extension_images; i++) {
5958 MountOptions *o;
5959
5960 fprintf(f, "%sExtensionImages: %s%s", prefix,
5961 c->extension_images[i].ignore_enoent ? "-": "",
5962 c->extension_images[i].source);
5963 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
5964 fprintf(f, ":%s:%s",
5965 partition_designator_to_string(o->partition_designator),
5966 strempty(o->options));
5967 fprintf(f, "\n");
5968 }
5969 }
5970
5971 bool exec_context_maintains_privileges(const ExecContext *c) {
5972 assert(c);
5973
5974 /* Returns true if the process forked off would run under
5975 * an unchanged UID or as root. */
5976
5977 if (!c->user)
5978 return true;
5979
5980 if (streq(c->user, "root") || streq(c->user, "0"))
5981 return true;
5982
5983 return false;
5984 }
5985
5986 int exec_context_get_effective_ioprio(const ExecContext *c) {
5987 int p;
5988
5989 assert(c);
5990
5991 if (c->ioprio_set)
5992 return c->ioprio;
5993
5994 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5995 if (p < 0)
5996 return ioprio_prio_value(IOPRIO_CLASS_BE, 4);
5997
5998 return p;
5999 }
6000
6001 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6002 assert(c);
6003
6004 /* Explicit setting wins */
6005 if (c->mount_apivfs_set)
6006 return c->mount_apivfs;
6007
6008 /* Default to "yes" if root directory or image are specified */
6009 if (exec_context_with_rootfs(c))
6010 return true;
6011
6012 return false;
6013 }
6014
6015 void exec_context_free_log_extra_fields(ExecContext *c) {
6016 assert(c);
6017
6018 for (size_t l = 0; l < c->n_log_extra_fields; l++)
6019 free(c->log_extra_fields[l].iov_base);
6020 c->log_extra_fields = mfree(c->log_extra_fields);
6021 c->n_log_extra_fields = 0;
6022 }
6023
6024 void exec_context_revert_tty(ExecContext *c) {
6025 _cleanup_close_ int fd = -1;
6026 const char *path;
6027 struct stat st;
6028 int r;
6029
6030 assert(c);
6031
6032 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6033 exec_context_tty_reset(c, NULL);
6034
6035 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6036 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6037 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6038 if (!exec_context_may_touch_tty(c))
6039 return;
6040
6041 path = exec_context_tty_path(c);
6042 if (!path)
6043 return;
6044
6045 fd = open(path, O_PATH|O_CLOEXEC);
6046 if (fd < 0)
6047 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6048 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6049 path);
6050
6051 if (fstat(fd, &st) < 0)
6052 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6053
6054 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6055 * if things are a character device, since a proper check either means we'd have to open the TTY and
6056 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6057 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6058 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6059 if (!S_ISCHR(st.st_mode))
6060 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6061
6062 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6063 if (r < 0)
6064 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6065 }
6066
6067 int exec_context_get_clean_directories(
6068 ExecContext *c,
6069 char **prefix,
6070 ExecCleanMask mask,
6071 char ***ret) {
6072
6073 _cleanup_strv_free_ char **l = NULL;
6074 int r;
6075
6076 assert(c);
6077 assert(prefix);
6078 assert(ret);
6079
6080 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6081 if (!FLAGS_SET(mask, 1U << t))
6082 continue;
6083
6084 if (!prefix[t])
6085 continue;
6086
6087 for (size_t i = 0; i < c->directories[t].n_items; i++) {
6088 char *j;
6089
6090 j = path_join(prefix[t], c->directories[t].items[i].path);
6091 if (!j)
6092 return -ENOMEM;
6093
6094 r = strv_consume(&l, j);
6095 if (r < 0)
6096 return r;
6097
6098 /* Also remove private directories unconditionally. */
6099 if (t != EXEC_DIRECTORY_CONFIGURATION) {
6100 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6101 if (!j)
6102 return -ENOMEM;
6103
6104 r = strv_consume(&l, j);
6105 if (r < 0)
6106 return r;
6107 }
6108
6109 char **symlink;
6110 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6111 j = path_join(prefix[t], *symlink);
6112 if (!j)
6113 return -ENOMEM;
6114
6115 r = strv_consume(&l, j);
6116 if (r < 0)
6117 return r;
6118 }
6119 }
6120 }
6121
6122 *ret = TAKE_PTR(l);
6123 return 0;
6124 }
6125
6126 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6127 ExecCleanMask mask = 0;
6128
6129 assert(c);
6130 assert(ret);
6131
6132 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6133 if (c->directories[t].n_items > 0)
6134 mask |= 1U << t;
6135
6136 *ret = mask;
6137 return 0;
6138 }
6139
6140 void exec_status_start(ExecStatus *s, pid_t pid) {
6141 assert(s);
6142
6143 *s = (ExecStatus) {
6144 .pid = pid,
6145 };
6146
6147 dual_timestamp_get(&s->start_timestamp);
6148 }
6149
6150 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
6151 assert(s);
6152
6153 if (s->pid != pid)
6154 *s = (ExecStatus) {
6155 .pid = pid,
6156 };
6157
6158 dual_timestamp_get(&s->exit_timestamp);
6159
6160 s->code = code;
6161 s->status = status;
6162
6163 if (context && context->utmp_id)
6164 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
6165 }
6166
6167 void exec_status_reset(ExecStatus *s) {
6168 assert(s);
6169
6170 *s = (ExecStatus) {};
6171 }
6172
6173 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
6174 assert(s);
6175 assert(f);
6176
6177 if (s->pid <= 0)
6178 return;
6179
6180 prefix = strempty(prefix);
6181
6182 fprintf(f,
6183 "%sPID: "PID_FMT"\n",
6184 prefix, s->pid);
6185
6186 if (dual_timestamp_is_set(&s->start_timestamp))
6187 fprintf(f,
6188 "%sStart Timestamp: %s\n",
6189 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
6190
6191 if (dual_timestamp_is_set(&s->exit_timestamp))
6192 fprintf(f,
6193 "%sExit Timestamp: %s\n"
6194 "%sExit Code: %s\n"
6195 "%sExit Status: %i\n",
6196 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
6197 prefix, sigchld_code_to_string(s->code),
6198 prefix, s->status);
6199 }
6200
6201 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
6202 _cleanup_free_ char *cmd = NULL;
6203 const char *prefix2;
6204
6205 assert(c);
6206 assert(f);
6207
6208 prefix = strempty(prefix);
6209 prefix2 = strjoina(prefix, "\t");
6210
6211 cmd = quote_command_line(c->argv);
6212 fprintf(f,
6213 "%sCommand Line: %s\n",
6214 prefix, cmd ? cmd : strerror_safe(ENOMEM));
6215
6216 exec_status_dump(&c->exec_status, f, prefix2);
6217 }
6218
6219 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6220 assert(f);
6221
6222 prefix = strempty(prefix);
6223
6224 LIST_FOREACH(command, c, c)
6225 exec_command_dump(c, f, prefix);
6226 }
6227
6228 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6229 ExecCommand *end;
6230
6231 assert(l);
6232 assert(e);
6233
6234 if (*l) {
6235 /* It's kind of important, that we keep the order here */
6236 LIST_FIND_TAIL(command, *l, end);
6237 LIST_INSERT_AFTER(command, *l, end, e);
6238 } else
6239 *l = e;
6240 }
6241
6242 int exec_command_set(ExecCommand *c, const char *path, ...) {
6243 va_list ap;
6244 char **l, *p;
6245
6246 assert(c);
6247 assert(path);
6248
6249 va_start(ap, path);
6250 l = strv_new_ap(path, ap);
6251 va_end(ap);
6252
6253 if (!l)
6254 return -ENOMEM;
6255
6256 p = strdup(path);
6257 if (!p) {
6258 strv_free(l);
6259 return -ENOMEM;
6260 }
6261
6262 free_and_replace(c->path, p);
6263
6264 return strv_free_and_replace(c->argv, l);
6265 }
6266
6267 int exec_command_append(ExecCommand *c, const char *path, ...) {
6268 _cleanup_strv_free_ char **l = NULL;
6269 va_list ap;
6270 int r;
6271
6272 assert(c);
6273 assert(path);
6274
6275 va_start(ap, path);
6276 l = strv_new_ap(path, ap);
6277 va_end(ap);
6278
6279 if (!l)
6280 return -ENOMEM;
6281
6282 r = strv_extend_strv(&c->argv, l, false);
6283 if (r < 0)
6284 return r;
6285
6286 return 0;
6287 }
6288
6289 static void *remove_tmpdir_thread(void *p) {
6290 _cleanup_free_ char *path = p;
6291
6292 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6293 return NULL;
6294 }
6295
6296 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6297 int r;
6298
6299 if (!rt)
6300 return NULL;
6301
6302 if (rt->manager)
6303 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6304
6305 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
6306
6307 if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
6308 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6309
6310 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
6311 if (r < 0)
6312 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
6313 else
6314 rt->tmp_dir = NULL;
6315 }
6316
6317 if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
6318 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6319
6320 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
6321 if (r < 0)
6322 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
6323 else
6324 rt->var_tmp_dir = NULL;
6325 }
6326
6327 rt->id = mfree(rt->id);
6328 rt->tmp_dir = mfree(rt->tmp_dir);
6329 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6330 safe_close_pair(rt->netns_storage_socket);
6331 safe_close_pair(rt->ipcns_storage_socket);
6332 return mfree(rt);
6333 }
6334
6335 static void exec_runtime_freep(ExecRuntime **rt) {
6336 (void) exec_runtime_free(*rt, false);
6337 }
6338
6339 static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6340 _cleanup_free_ char *id_copy = NULL;
6341 ExecRuntime *n;
6342
6343 assert(ret);
6344
6345 id_copy = strdup(id);
6346 if (!id_copy)
6347 return -ENOMEM;
6348
6349 n = new(ExecRuntime, 1);
6350 if (!n)
6351 return -ENOMEM;
6352
6353 *n = (ExecRuntime) {
6354 .id = TAKE_PTR(id_copy),
6355 .netns_storage_socket = { -1, -1 },
6356 .ipcns_storage_socket = { -1, -1 },
6357 };
6358
6359 *ret = n;
6360 return 0;
6361 }
6362
6363 static int exec_runtime_add(
6364 Manager *m,
6365 const char *id,
6366 char **tmp_dir,
6367 char **var_tmp_dir,
6368 int netns_storage_socket[2],
6369 int ipcns_storage_socket[2],
6370 ExecRuntime **ret) {
6371
6372 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6373 int r;
6374
6375 assert(m);
6376 assert(id);
6377
6378 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6379
6380 r = exec_runtime_allocate(&rt, id);
6381 if (r < 0)
6382 return r;
6383
6384 r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
6385 if (r < 0)
6386 return r;
6387
6388 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6389 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6390 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6391
6392 if (netns_storage_socket) {
6393 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6394 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6395 }
6396
6397 if (ipcns_storage_socket) {
6398 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6399 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6400 }
6401
6402 rt->manager = m;
6403
6404 if (ret)
6405 *ret = rt;
6406 /* do not remove created ExecRuntime object when the operation succeeds. */
6407 TAKE_PTR(rt);
6408 return 0;
6409 }
6410
6411 static int exec_runtime_make(
6412 Manager *m,
6413 const ExecContext *c,
6414 const char *id,
6415 ExecRuntime **ret) {
6416
6417 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6418 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
6419 int r;
6420
6421 assert(m);
6422 assert(c);
6423 assert(id);
6424
6425 /* It is not necessary to create ExecRuntime object. */
6426 if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
6427 *ret = NULL;
6428 return 0;
6429 }
6430
6431 if (c->private_tmp &&
6432 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6433 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6434 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6435 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6436 if (r < 0)
6437 return r;
6438 }
6439
6440 if (c->private_network || c->network_namespace_path) {
6441 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6442 return -errno;
6443 }
6444
6445 if (c->private_ipc || c->ipc_namespace_path) {
6446 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6447 return -errno;
6448 }
6449
6450 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
6451 if (r < 0)
6452 return r;
6453
6454 return 1;
6455 }
6456
6457 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6458 ExecRuntime *rt;
6459 int r;
6460
6461 assert(m);
6462 assert(id);
6463 assert(ret);
6464
6465 rt = hashmap_get(m->exec_runtime_by_id, id);
6466 if (rt)
6467 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
6468 goto ref;
6469
6470 if (!create) {
6471 *ret = NULL;
6472 return 0;
6473 }
6474
6475 /* If not found, then create a new object. */
6476 r = exec_runtime_make(m, c, id, &rt);
6477 if (r < 0)
6478 return r;
6479 if (r == 0) {
6480 /* When r == 0, it is not necessary to create ExecRuntime object. */
6481 *ret = NULL;
6482 return 0;
6483 }
6484
6485 ref:
6486 /* increment reference counter. */
6487 rt->n_ref++;
6488 *ret = rt;
6489 return 1;
6490 }
6491
6492 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6493 if (!rt)
6494 return NULL;
6495
6496 assert(rt->n_ref > 0);
6497
6498 rt->n_ref--;
6499 if (rt->n_ref > 0)
6500 return NULL;
6501
6502 return exec_runtime_free(rt, destroy);
6503 }
6504
6505 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6506 ExecRuntime *rt;
6507
6508 assert(m);
6509 assert(f);
6510 assert(fds);
6511
6512 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6513 fprintf(f, "exec-runtime=%s", rt->id);
6514
6515 if (rt->tmp_dir)
6516 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6517
6518 if (rt->var_tmp_dir)
6519 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6520
6521 if (rt->netns_storage_socket[0] >= 0) {
6522 int copy;
6523
6524 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6525 if (copy < 0)
6526 return copy;
6527
6528 fprintf(f, " netns-socket-0=%i", copy);
6529 }
6530
6531 if (rt->netns_storage_socket[1] >= 0) {
6532 int copy;
6533
6534 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6535 if (copy < 0)
6536 return copy;
6537
6538 fprintf(f, " netns-socket-1=%i", copy);
6539 }
6540
6541 if (rt->ipcns_storage_socket[0] >= 0) {
6542 int copy;
6543
6544 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6545 if (copy < 0)
6546 return copy;
6547
6548 fprintf(f, " ipcns-socket-0=%i", copy);
6549 }
6550
6551 if (rt->ipcns_storage_socket[1] >= 0) {
6552 int copy;
6553
6554 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6555 if (copy < 0)
6556 return copy;
6557
6558 fprintf(f, " ipcns-socket-1=%i", copy);
6559 }
6560
6561 fputc('\n', f);
6562 }
6563
6564 return 0;
6565 }
6566
6567 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6568 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6569 ExecRuntime *rt;
6570 int r;
6571
6572 /* This is for the migration from old (v237 or earlier) deserialization text.
6573 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6574 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6575 * so or not from the serialized text, then we always creates a new object owned by this. */
6576
6577 assert(u);
6578 assert(key);
6579 assert(value);
6580
6581 /* Manager manages ExecRuntime objects by the unit id.
6582 * So, we omit the serialized text when the unit does not have id (yet?)... */
6583 if (isempty(u->id)) {
6584 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6585 return 0;
6586 }
6587
6588 if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6589 return log_oom();
6590
6591 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6592 if (!rt) {
6593 if (exec_runtime_allocate(&rt_create, u->id) < 0)
6594 return log_oom();
6595
6596 rt = rt_create;
6597 }
6598
6599 if (streq(key, "tmp-dir")) {
6600 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6601 return -ENOMEM;
6602
6603 } else if (streq(key, "var-tmp-dir")) {
6604 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6605 return -ENOMEM;
6606
6607 } else if (streq(key, "netns-socket-0")) {
6608 int fd;
6609
6610 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6611 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6612 return 0;
6613 }
6614
6615 safe_close(rt->netns_storage_socket[0]);
6616 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6617
6618 } else if (streq(key, "netns-socket-1")) {
6619 int fd;
6620
6621 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6622 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6623 return 0;
6624 }
6625
6626 safe_close(rt->netns_storage_socket[1]);
6627 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
6628
6629 } else
6630 return 0;
6631
6632 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6633 if (rt_create) {
6634 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6635 if (r < 0) {
6636 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
6637 return 0;
6638 }
6639
6640 rt_create->manager = u->manager;
6641
6642 /* Avoid cleanup */
6643 TAKE_PTR(rt_create);
6644 }
6645
6646 return 1;
6647 }
6648
6649 int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6650 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6651 char *id = NULL;
6652 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
6653 const char *p, *v = value;
6654 size_t n;
6655
6656 assert(m);
6657 assert(value);
6658 assert(fds);
6659
6660 n = strcspn(v, " ");
6661 id = strndupa_safe(v, n);
6662 if (v[n] != ' ')
6663 goto finalize;
6664 p = v + n + 1;
6665
6666 v = startswith(p, "tmp-dir=");
6667 if (v) {
6668 n = strcspn(v, " ");
6669 tmp_dir = strndup(v, n);
6670 if (!tmp_dir)
6671 return log_oom();
6672 if (v[n] != ' ')
6673 goto finalize;
6674 p = v + n + 1;
6675 }
6676
6677 v = startswith(p, "var-tmp-dir=");
6678 if (v) {
6679 n = strcspn(v, " ");
6680 var_tmp_dir = strndup(v, n);
6681 if (!var_tmp_dir)
6682 return log_oom();
6683 if (v[n] != ' ')
6684 goto finalize;
6685 p = v + n + 1;
6686 }
6687
6688 v = startswith(p, "netns-socket-0=");
6689 if (v) {
6690 char *buf;
6691
6692 n = strcspn(v, " ");
6693 buf = strndupa_safe(v, n);
6694
6695 r = safe_atoi(buf, &netns_fdpair[0]);
6696 if (r < 0)
6697 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
6698 if (!fdset_contains(fds, netns_fdpair[0]))
6699 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6700 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6701 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
6702 if (v[n] != ' ')
6703 goto finalize;
6704 p = v + n + 1;
6705 }
6706
6707 v = startswith(p, "netns-socket-1=");
6708 if (v) {
6709 char *buf;
6710
6711 n = strcspn(v, " ");
6712 buf = strndupa_safe(v, n);
6713
6714 r = safe_atoi(buf, &netns_fdpair[1]);
6715 if (r < 0)
6716 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
6717 if (!fdset_contains(fds, netns_fdpair[1]))
6718 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6719 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6720 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6721 if (v[n] != ' ')
6722 goto finalize;
6723 p = v + n + 1;
6724 }
6725
6726 v = startswith(p, "ipcns-socket-0=");
6727 if (v) {
6728 char *buf;
6729
6730 n = strcspn(v, " ");
6731 buf = strndupa_safe(v, n);
6732
6733 r = safe_atoi(buf, &ipcns_fdpair[0]);
6734 if (r < 0)
6735 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6736 if (!fdset_contains(fds, ipcns_fdpair[0]))
6737 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6738 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6739 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6740 if (v[n] != ' ')
6741 goto finalize;
6742 p = v + n + 1;
6743 }
6744
6745 v = startswith(p, "ipcns-socket-1=");
6746 if (v) {
6747 char *buf;
6748
6749 n = strcspn(v, " ");
6750 buf = strndupa_safe(v, n);
6751
6752 r = safe_atoi(buf, &ipcns_fdpair[1]);
6753 if (r < 0)
6754 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6755 if (!fdset_contains(fds, ipcns_fdpair[1]))
6756 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6757 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6758 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
6759 }
6760
6761 finalize:
6762 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
6763 if (r < 0)
6764 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6765 return 0;
6766 }
6767
6768 void exec_runtime_vacuum(Manager *m) {
6769 ExecRuntime *rt;
6770
6771 assert(m);
6772
6773 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6774
6775 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6776 if (rt->n_ref > 0)
6777 continue;
6778
6779 (void) exec_runtime_free(rt, false);
6780 }
6781 }
6782
6783 void exec_params_clear(ExecParameters *p) {
6784 if (!p)
6785 return;
6786
6787 p->environment = strv_free(p->environment);
6788 p->fd_names = strv_free(p->fd_names);
6789 p->fds = mfree(p->fds);
6790 p->exec_fd = safe_close(p->exec_fd);
6791 }
6792
6793 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
6794 if (!sc)
6795 return NULL;
6796
6797 free(sc->id);
6798 free(sc->data);
6799 return mfree(sc);
6800 }
6801
6802 ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
6803 if (!lc)
6804 return NULL;
6805
6806 free(lc->id);
6807 free(lc->path);
6808 return mfree(lc);
6809 }
6810
6811 void exec_directory_done(ExecDirectory *d) {
6812 if (!d)
6813 return;
6814
6815 for (size_t i = 0; i < d->n_items; i++) {
6816 free(d->items[i].path);
6817 strv_free(d->items[i].symlinks);
6818 }
6819
6820 d->items = mfree(d->items);
6821 d->n_items = 0;
6822 d->mode = 0755;
6823 }
6824
6825 int exec_directory_add(ExecDirectoryItem **d, size_t *n, const char *path, char **symlinks) {
6826 _cleanup_strv_free_ char **s = NULL;
6827 _cleanup_free_ char *p = NULL;
6828
6829 assert(d);
6830 assert(n);
6831 assert(path);
6832
6833 p = strdup(path);
6834 if (!p)
6835 return -ENOMEM;
6836
6837 if (symlinks) {
6838 s = strv_copy(symlinks);
6839 if (!s)
6840 return -ENOMEM;
6841 }
6842
6843 if (!GREEDY_REALLOC(*d, *n + 1))
6844 return -ENOMEM;
6845
6846 (*d)[(*n) ++] = (ExecDirectoryItem) {
6847 .path = TAKE_PTR(p),
6848 .symlinks = TAKE_PTR(s),
6849 };
6850
6851 return 0;
6852 }
6853
6854 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
6855 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
6856
6857 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
6858 [EXEC_INPUT_NULL] = "null",
6859 [EXEC_INPUT_TTY] = "tty",
6860 [EXEC_INPUT_TTY_FORCE] = "tty-force",
6861 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
6862 [EXEC_INPUT_SOCKET] = "socket",
6863 [EXEC_INPUT_NAMED_FD] = "fd",
6864 [EXEC_INPUT_DATA] = "data",
6865 [EXEC_INPUT_FILE] = "file",
6866 };
6867
6868 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
6869
6870 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
6871 [EXEC_OUTPUT_INHERIT] = "inherit",
6872 [EXEC_OUTPUT_NULL] = "null",
6873 [EXEC_OUTPUT_TTY] = "tty",
6874 [EXEC_OUTPUT_KMSG] = "kmsg",
6875 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
6876 [EXEC_OUTPUT_JOURNAL] = "journal",
6877 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
6878 [EXEC_OUTPUT_SOCKET] = "socket",
6879 [EXEC_OUTPUT_NAMED_FD] = "fd",
6880 [EXEC_OUTPUT_FILE] = "file",
6881 [EXEC_OUTPUT_FILE_APPEND] = "append",
6882 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
6883 };
6884
6885 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
6886
6887 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
6888 [EXEC_UTMP_INIT] = "init",
6889 [EXEC_UTMP_LOGIN] = "login",
6890 [EXEC_UTMP_USER] = "user",
6891 };
6892
6893 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
6894
6895 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
6896 [EXEC_PRESERVE_NO] = "no",
6897 [EXEC_PRESERVE_YES] = "yes",
6898 [EXEC_PRESERVE_RESTART] = "restart",
6899 };
6900
6901 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
6902
6903 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
6904 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6905 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
6906 [EXEC_DIRECTORY_STATE] = "StateDirectory",
6907 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
6908 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
6909 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
6910 };
6911
6912 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
6913
6914 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
6915 static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6916 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
6917 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
6918 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
6919 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
6920 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
6921 };
6922
6923 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
6924
6925 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
6926 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
6927 * directories, specifically .timer units with their timestamp touch file. */
6928 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6929 [EXEC_DIRECTORY_RUNTIME] = "runtime",
6930 [EXEC_DIRECTORY_STATE] = "state",
6931 [EXEC_DIRECTORY_CACHE] = "cache",
6932 [EXEC_DIRECTORY_LOGS] = "logs",
6933 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
6934 };
6935
6936 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
6937
6938 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
6939 * the service payload in. */
6940 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6941 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
6942 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
6943 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
6944 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
6945 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
6946 };
6947
6948 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
6949
6950 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
6951 [EXEC_KEYRING_INHERIT] = "inherit",
6952 [EXEC_KEYRING_PRIVATE] = "private",
6953 [EXEC_KEYRING_SHARED] = "shared",
6954 };
6955
6956 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);