]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
core: fix invalid assertion
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <poll.h>
6 #include <sys/eventfd.h>
7 #include <sys/ioctl.h>
8 #include <sys/mman.h>
9 #include <sys/personality.h>
10 #include <sys/prctl.h>
11 #include <sys/shm.h>
12 #include <sys/types.h>
13 #include <sys/un.h>
14 #include <unistd.h>
15 #include <utmpx.h>
16
17 #if HAVE_PAM
18 #include <security/pam_appl.h>
19 #endif
20
21 #if HAVE_SELINUX
22 #include <selinux/selinux.h>
23 #endif
24
25 #if HAVE_SECCOMP
26 #include <seccomp.h>
27 #endif
28
29 #if HAVE_APPARMOR
30 #include <sys/apparmor.h>
31 #endif
32
33 #include "sd-messages.h"
34
35 #include "af-list.h"
36 #include "alloc-util.h"
37 #if HAVE_APPARMOR
38 #include "apparmor-util.h"
39 #endif
40 #include "async.h"
41 #include "barrier.h"
42 #include "cap-list.h"
43 #include "capability-util.h"
44 #include "chown-recursive.h"
45 #include "cgroup-setup.h"
46 #include "cpu-set-util.h"
47 #include "def.h"
48 #include "env-file.h"
49 #include "env-util.h"
50 #include "errno-list.h"
51 #include "execute.h"
52 #include "exit-status.h"
53 #include "fd-util.h"
54 #include "format-util.h"
55 #include "fs-util.h"
56 #include "glob-util.h"
57 #include "hexdecoct.h"
58 #include "io-util.h"
59 #include "ioprio.h"
60 #include "label.h"
61 #include "log.h"
62 #include "macro.h"
63 #include "manager.h"
64 #include "memory-util.h"
65 #include "missing_fs.h"
66 #include "mkdir.h"
67 #include "namespace.h"
68 #include "parse-util.h"
69 #include "path-util.h"
70 #include "process-util.h"
71 #include "rlimit-util.h"
72 #include "rm-rf.h"
73 #if HAVE_SECCOMP
74 #include "seccomp-util.h"
75 #endif
76 #include "securebits-util.h"
77 #include "selinux-util.h"
78 #include "signal-util.h"
79 #include "smack-util.h"
80 #include "socket-util.h"
81 #include "special.h"
82 #include "stat-util.h"
83 #include "string-table.h"
84 #include "string-util.h"
85 #include "strv.h"
86 #include "syslog-util.h"
87 #include "terminal-util.h"
88 #include "umask-util.h"
89 #include "unit.h"
90 #include "user-util.h"
91 #include "utmp-wtmp.h"
92
/* Idle timeouts, expressed as multiples of USEC_PER_SEC (5 and 1 respectively). */
#define IDLE_TIMEOUT_USEC  (5 * USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1 * USEC_PER_SEC)

/* Send buffer size (8 MiB) requested via fd_inc_sndbuf() for the journal stream socket. */
#define SNDBUF_SIZE (8 * 1024 * 1024)
97
/* Relocates the passed file descriptors so that fds[i] ends up being fd number i + 3, i.e. the array
 * occupies the contiguous range starting right after stdin/stdout/stderr.
 *
 * The array is rewritten in place: whenever an entry is duplicated to a new number, the old descriptor
 * is closed and the entry is updated.
 *
 * Returns 0 on success (also when n_fds is zero), or a negative errno if fcntl(F_DUPFD) fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0, retry_at;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int idx;

                retry_at = -1;

                for (idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, dup_fd;

                        /* Nothing to do if the entry already sits at its target number. */
                        if (fds[idx] == wanted)
                                continue;

                        /* F_DUPFD hands out the lowest free fd >= wanted, which may be higher than
                         * wanted if that slot is still occupied by another entry. */
                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* Remember the first entry that did not land on its target, so that another
                         * pass can start from there. */
                        if (retry_at < 0 && dup_fd != wanted)
                                retry_at = idx;
                }

                first = retry_at;
        } while (retry_at >= 0);

        return 0;
}
142
/* Adjusts the file descriptor flags of the fds that are about to be passed to a child.
 *
 * The first n_socket_fds entries of fds[] get O_NONBLOCK set or cleared according to 'nonblock'; the
 * remaining n_storage_fds entries are left alone in that regard. FD_CLOEXEC is dropped from every
 * entry, since the fds are meant to survive into the child.
 *
 * Returns 0 on success, or the negative error returned by fd_nonblock()/fd_cloexec(). */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds, idx;

        if (total == 0)
                return 0;

        assert(fds);

        for (idx = 0; idx < total; idx++) {
                int r;

                /* O_NONBLOCK handling only applies to the socket activation fds. */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
175
176 static const char *exec_context_tty_path(const ExecContext *context) {
177 assert(context);
178
179 if (context->stdio_as_fds)
180 return NULL;
181
182 if (context->tty_path)
183 return context->tty_path;
184
185 return "/dev/console";
186 }
187
188 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
189 const char *path;
190
191 assert(context);
192
193 path = exec_context_tty_path(context);
194
195 if (context->tty_vhangup) {
196 if (p && p->stdin_fd >= 0)
197 (void) terminal_vhangup_fd(p->stdin_fd);
198 else if (path)
199 (void) terminal_vhangup(path);
200 }
201
202 if (context->tty_reset) {
203 if (p && p->stdin_fd >= 0)
204 (void) reset_terminal_fd(p->stdin_fd, true);
205 else if (path)
206 (void) reset_terminal(path);
207 }
208
209 if (context->tty_vt_disallocate && path)
210 (void) vt_disallocate(path);
211 }
212
213 static bool is_terminal_input(ExecInput i) {
214 return IN_SET(i,
215 EXEC_INPUT_TTY,
216 EXEC_INPUT_TTY_FORCE,
217 EXEC_INPUT_TTY_FAIL);
218 }
219
220 static bool is_terminal_output(ExecOutput o) {
221 return IN_SET(o,
222 EXEC_OUTPUT_TTY,
223 EXEC_OUTPUT_KMSG_AND_CONSOLE,
224 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
225 }
226
227 static bool is_kmsg_output(ExecOutput o) {
228 return IN_SET(o,
229 EXEC_OUTPUT_KMSG,
230 EXEC_OUTPUT_KMSG_AND_CONSOLE);
231 }
232
233 static bool exec_context_needs_term(const ExecContext *c) {
234 assert(c);
235
236 /* Return true if the execution context suggests we should set $TERM to something useful. */
237
238 if (is_terminal_input(c->std_input))
239 return true;
240
241 if (is_terminal_output(c->std_output))
242 return true;
243
244 if (is_terminal_output(c->std_error))
245 return true;
246
247 return !!c->tty_path;
248 }
249
/* Opens /dev/null with the given access flags (plus O_NOCTTY) and moves it to fd number 'nfd'.
 * Returns the result of move_fd(), or a negative errno if /dev/null cannot be opened. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        return move_fd(null_fd, nfd, false);
}
261
262 static int connect_journal_socket(
263 int fd,
264 const char *log_namespace,
265 uid_t uid,
266 gid_t gid) {
267
268 union sockaddr_union sa;
269 socklen_t sa_len;
270 uid_t olduid = UID_INVALID;
271 gid_t oldgid = GID_INVALID;
272 const char *j;
273 int r;
274
275 j = log_namespace ?
276 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
277 "/run/systemd/journal/stdout";
278 r = sockaddr_un_set_path(&sa.un, j);
279 if (r < 0)
280 return r;
281 sa_len = r;
282
283 if (gid_is_valid(gid)) {
284 oldgid = getgid();
285
286 if (setegid(gid) < 0)
287 return -errno;
288 }
289
290 if (uid_is_valid(uid)) {
291 olduid = getuid();
292
293 if (seteuid(uid) < 0) {
294 r = -errno;
295 goto restore_gid;
296 }
297 }
298
299 r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
300
301 /* If we fail to restore the uid or gid, things will likely
302 fail later on. This should only happen if an LSM interferes. */
303
304 if (uid_is_valid(uid))
305 (void) seteuid(olduid);
306
307 restore_gid:
308 if (gid_is_valid(gid))
309 (void) setegid(oldgid);
310
311 return r;
312 }
313
314 static int connect_logger_as(
315 const Unit *unit,
316 const ExecContext *context,
317 const ExecParameters *params,
318 ExecOutput output,
319 const char *ident,
320 int nfd,
321 uid_t uid,
322 gid_t gid) {
323
324 _cleanup_close_ int fd = -1;
325 int r;
326
327 assert(context);
328 assert(params);
329 assert(output < _EXEC_OUTPUT_MAX);
330 assert(ident);
331 assert(nfd >= 0);
332
333 fd = socket(AF_UNIX, SOCK_STREAM, 0);
334 if (fd < 0)
335 return -errno;
336
337 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
338 if (r < 0)
339 return r;
340
341 if (shutdown(fd, SHUT_RD) < 0)
342 return -errno;
343
344 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
345
346 if (dprintf(fd,
347 "%s\n"
348 "%s\n"
349 "%i\n"
350 "%i\n"
351 "%i\n"
352 "%i\n"
353 "%i\n",
354 context->syslog_identifier ?: ident,
355 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
356 context->syslog_priority,
357 !!context->syslog_level_prefix,
358 false,
359 is_kmsg_output(output),
360 is_terminal_output(output)) < 0)
361 return -errno;
362
363 return move_fd(TAKE_FD(fd), nfd, false);
364 }
365
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves it to fd number 'nfd'.
 * Returns the result of move_fd(), or the negative error returned by open_terminal(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
378
379 static int acquire_path(const char *path, int flags, mode_t mode) {
380 union sockaddr_union sa;
381 socklen_t sa_len;
382 _cleanup_close_ int fd = -1;
383 int r;
384
385 assert(path);
386
387 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
388 flags |= O_CREAT;
389
390 fd = open(path, flags|O_NOCTTY, mode);
391 if (fd >= 0)
392 return TAKE_FD(fd);
393
394 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
395 return -errno;
396
397 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
398
399 r = sockaddr_un_set_path(&sa.un, path);
400 if (r < 0)
401 return r == -EINVAL ? -ENXIO : r;
402 sa_len = r;
403
404 fd = socket(AF_UNIX, SOCK_STREAM, 0);
405 if (fd < 0)
406 return -errno;
407
408 if (connect(fd, &sa.sa, sa_len) < 0)
409 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
410 * indication that his wasn't an AF_UNIX socket after all */
411
412 if ((flags & O_ACCMODE) == O_RDONLY)
413 r = shutdown(fd, SHUT_WR);
414 else if ((flags & O_ACCMODE) == O_WRONLY)
415 r = shutdown(fd, SHUT_RD);
416 else
417 r = 0;
418 if (r < 0)
419 return -errno;
420
421 return TAKE_FD(fd);
422 }
423
424 static int fixup_input(
425 const ExecContext *context,
426 int socket_fd,
427 bool apply_tty_stdin) {
428
429 ExecInput std_input;
430
431 assert(context);
432
433 std_input = context->std_input;
434
435 if (is_terminal_input(std_input) && !apply_tty_stdin)
436 return EXEC_INPUT_NULL;
437
438 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
439 return EXEC_INPUT_NULL;
440
441 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
442 return EXEC_INPUT_NULL;
443
444 return std_input;
445 }
446
447 static int fixup_output(ExecOutput std_output, int socket_fd) {
448
449 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
450 return EXEC_OUTPUT_INHERIT;
451
452 return std_output;
453 }
454
455 static int setup_input(
456 const ExecContext *context,
457 const ExecParameters *params,
458 int socket_fd,
459 const int named_iofds[static 3]) {
460
461 ExecInput i;
462
463 assert(context);
464 assert(params);
465 assert(named_iofds);
466
467 if (params->stdin_fd >= 0) {
468 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
469 return -errno;
470
471 /* Try to make this the controlling tty, if it is a tty, and reset it */
472 if (isatty(STDIN_FILENO)) {
473 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
474 (void) reset_terminal_fd(STDIN_FILENO, true);
475 }
476
477 return STDIN_FILENO;
478 }
479
480 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
481
482 switch (i) {
483
484 case EXEC_INPUT_NULL:
485 return open_null_as(O_RDONLY, STDIN_FILENO);
486
487 case EXEC_INPUT_TTY:
488 case EXEC_INPUT_TTY_FORCE:
489 case EXEC_INPUT_TTY_FAIL: {
490 int fd;
491
492 fd = acquire_terminal(exec_context_tty_path(context),
493 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
494 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
495 ACQUIRE_TERMINAL_WAIT,
496 USEC_INFINITY);
497 if (fd < 0)
498 return fd;
499
500 return move_fd(fd, STDIN_FILENO, false);
501 }
502
503 case EXEC_INPUT_SOCKET:
504 assert(socket_fd >= 0);
505
506 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
507
508 case EXEC_INPUT_NAMED_FD:
509 assert(named_iofds[STDIN_FILENO] >= 0);
510
511 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
512 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
513
514 case EXEC_INPUT_DATA: {
515 int fd;
516
517 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
518 if (fd < 0)
519 return fd;
520
521 return move_fd(fd, STDIN_FILENO, false);
522 }
523
524 case EXEC_INPUT_FILE: {
525 bool rw;
526 int fd;
527
528 assert(context->stdio_file[STDIN_FILENO]);
529
530 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
531 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
532
533 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
534 if (fd < 0)
535 return fd;
536
537 return move_fd(fd, STDIN_FILENO, false);
538 }
539
540 default:
541 assert_not_reached("Unknown input type");
542 }
543 }
544
545 static bool can_inherit_stderr_from_stdout(
546 const ExecContext *context,
547 ExecOutput o,
548 ExecOutput e) {
549
550 assert(context);
551
552 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
553 * stderr fd */
554
555 if (e == EXEC_OUTPUT_INHERIT)
556 return true;
557 if (e != o)
558 return false;
559
560 if (e == EXEC_OUTPUT_NAMED_FD)
561 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
562
563 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
564 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
565
566 return true;
567 }
568
569 static int setup_output(
570 const Unit *unit,
571 const ExecContext *context,
572 const ExecParameters *params,
573 int fileno,
574 int socket_fd,
575 const int named_iofds[static 3],
576 const char *ident,
577 uid_t uid,
578 gid_t gid,
579 dev_t *journal_stream_dev,
580 ino_t *journal_stream_ino) {
581
582 ExecOutput o;
583 ExecInput i;
584 int r;
585
586 assert(unit);
587 assert(context);
588 assert(params);
589 assert(ident);
590 assert(journal_stream_dev);
591 assert(journal_stream_ino);
592
593 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
594
595 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
596 return -errno;
597
598 return STDOUT_FILENO;
599 }
600
601 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
602 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
603 return -errno;
604
605 return STDERR_FILENO;
606 }
607
608 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
609 o = fixup_output(context->std_output, socket_fd);
610
611 if (fileno == STDERR_FILENO) {
612 ExecOutput e;
613 e = fixup_output(context->std_error, socket_fd);
614
615 /* This expects the input and output are already set up */
616
617 /* Don't change the stderr file descriptor if we inherit all
618 * the way and are not on a tty */
619 if (e == EXEC_OUTPUT_INHERIT &&
620 o == EXEC_OUTPUT_INHERIT &&
621 i == EXEC_INPUT_NULL &&
622 !is_terminal_input(context->std_input) &&
623 getppid () != 1)
624 return fileno;
625
626 /* Duplicate from stdout if possible */
627 if (can_inherit_stderr_from_stdout(context, o, e))
628 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
629
630 o = e;
631
632 } else if (o == EXEC_OUTPUT_INHERIT) {
633 /* If input got downgraded, inherit the original value */
634 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
635 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
636
637 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
638 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
639 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
640
641 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
642 if (getppid() != 1)
643 return fileno;
644
645 /* We need to open /dev/null here anew, to get the right access mode. */
646 return open_null_as(O_WRONLY, fileno);
647 }
648
649 switch (o) {
650
651 case EXEC_OUTPUT_NULL:
652 return open_null_as(O_WRONLY, fileno);
653
654 case EXEC_OUTPUT_TTY:
655 if (is_terminal_input(i))
656 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
657
658 /* We don't reset the terminal if this is just about output */
659 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
660
661 case EXEC_OUTPUT_KMSG:
662 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
663 case EXEC_OUTPUT_JOURNAL:
664 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
665 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
666 if (r < 0) {
667 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
668 r = open_null_as(O_WRONLY, fileno);
669 } else {
670 struct stat st;
671
672 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
673 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
674 * services to detect whether they are connected to the journal or not.
675 *
676 * If both stdout and stderr are connected to a stream then let's make sure to store the data
677 * about STDERR as that's usually the best way to do logging. */
678
679 if (fstat(fileno, &st) >= 0 &&
680 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
681 *journal_stream_dev = st.st_dev;
682 *journal_stream_ino = st.st_ino;
683 }
684 }
685 return r;
686
687 case EXEC_OUTPUT_SOCKET:
688 assert(socket_fd >= 0);
689
690 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
691
692 case EXEC_OUTPUT_NAMED_FD:
693 assert(named_iofds[fileno] >= 0);
694
695 (void) fd_nonblock(named_iofds[fileno], false);
696 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
697
698 case EXEC_OUTPUT_FILE:
699 case EXEC_OUTPUT_FILE_APPEND: {
700 bool rw;
701 int fd, flags;
702
703 assert(context->stdio_file[fileno]);
704
705 rw = context->std_input == EXEC_INPUT_FILE &&
706 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
707
708 if (rw)
709 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
710
711 flags = O_WRONLY;
712 if (o == EXEC_OUTPUT_FILE_APPEND)
713 flags |= O_APPEND;
714
715 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
716 if (fd < 0)
717 return fd;
718
719 return move_fd(fd, fileno, 0);
720 }
721
722 default:
723 assert_not_reached("Unknown error type");
724 }
725 }
726
727 static int chown_terminal(int fd, uid_t uid) {
728 int r;
729
730 assert(fd >= 0);
731
732 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
733 if (isatty(fd) < 1) {
734 if (IN_SET(errno, EINVAL, ENOTTY))
735 return 0; /* not a tty */
736
737 return -errno;
738 }
739
740 /* This might fail. What matters are the results. */
741 r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
742 if (r < 0)
743 return r;
744
745 return 1;
746 }
747
748 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
749 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
750 int r;
751
752 assert(_saved_stdin);
753 assert(_saved_stdout);
754
755 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
756 if (saved_stdin < 0)
757 return -errno;
758
759 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
760 if (saved_stdout < 0)
761 return -errno;
762
763 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
764 if (fd < 0)
765 return fd;
766
767 r = chown_terminal(fd, getuid());
768 if (r < 0)
769 return r;
770
771 r = reset_terminal_fd(fd, true);
772 if (r < 0)
773 return r;
774
775 r = rearrange_stdio(fd, fd, STDERR_FILENO);
776 fd = -1;
777 if (r < 0)
778 return r;
779
780 *_saved_stdin = saved_stdin;
781 *_saved_stdout = saved_stdout;
782
783 saved_stdin = saved_stdout = -1;
784
785 return 0;
786 }
787
788 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
789 assert(err < 0);
790
791 if (err == -ETIMEDOUT)
792 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
793 else {
794 errno = -err;
795 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
796 }
797 }
798
799 static void write_confirm_error(int err, const char *vc, const Unit *u) {
800 _cleanup_close_ int fd = -1;
801
802 assert(vc);
803
804 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
805 if (fd < 0)
806 return;
807
808 write_confirm_error_fd(err, fd, u);
809 }
810
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved fds back in place as stdin and
 * stdout, and closes the saved copies (resetting *saved_stdin/*saved_stdout to what safe_close()
 * returns).
 *
 * Returns 0 on success, or the negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int r = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                r = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                r = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return r;
}
832
/* Possible results of ask_for_confirmation(). */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS,      /* (0) don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE,              /* (1) execute the command */
};
838
839 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
840 int saved_stdout = -1, saved_stdin = -1, r;
841 _cleanup_free_ char *e = NULL;
842 char c;
843
844 /* For any internal errors, assume a positive response. */
845 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
846 if (r < 0) {
847 write_confirm_error(r, vc, u);
848 return CONFIRM_EXECUTE;
849 }
850
851 /* confirm_spawn might have been disabled while we were sleeping. */
852 if (manager_is_confirm_spawn_disabled(u->manager)) {
853 r = 1;
854 goto restore_stdio;
855 }
856
857 e = ellipsize(cmdline, 60, 100);
858 if (!e) {
859 log_oom();
860 r = CONFIRM_EXECUTE;
861 goto restore_stdio;
862 }
863
864 for (;;) {
865 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
866 if (r < 0) {
867 write_confirm_error_fd(r, STDOUT_FILENO, u);
868 r = CONFIRM_EXECUTE;
869 goto restore_stdio;
870 }
871
872 switch (c) {
873 case 'c':
874 printf("Resuming normal execution.\n");
875 manager_disable_confirm_spawn();
876 r = 1;
877 break;
878 case 'D':
879 unit_dump(u, stdout, " ");
880 continue; /* ask again */
881 case 'f':
882 printf("Failing execution.\n");
883 r = CONFIRM_PRETEND_FAILURE;
884 break;
885 case 'h':
886 printf(" c - continue, proceed without asking anymore\n"
887 " D - dump, show the state of the unit\n"
888 " f - fail, don't execute the command and pretend it failed\n"
889 " h - help\n"
890 " i - info, show a short summary of the unit\n"
891 " j - jobs, show jobs that are in progress\n"
892 " s - skip, don't execute the command and pretend it succeeded\n"
893 " y - yes, execute the command\n");
894 continue; /* ask again */
895 case 'i':
896 printf(" Description: %s\n"
897 " Unit: %s\n"
898 " Command: %s\n",
899 u->id, u->description, cmdline);
900 continue; /* ask again */
901 case 'j':
902 manager_dump_jobs(u->manager, stdout, " ");
903 continue; /* ask again */
904 case 'n':
905 /* 'n' was removed in favor of 'f'. */
906 printf("Didn't understand 'n', did you mean 'f'?\n");
907 continue; /* ask again */
908 case 's':
909 printf("Skipping execution.\n");
910 r = CONFIRM_PRETEND_SUCCESS;
911 break;
912 case 'y':
913 r = CONFIRM_EXECUTE;
914 break;
915 default:
916 assert_not_reached("Unhandled choice");
917 }
918 break;
919 }
920
921 restore_stdio:
922 restore_confirm_stdio(&saved_stdin, &saved_stdout);
923 return r;
924 }
925
926 static int get_fixed_user(const ExecContext *c, const char **user,
927 uid_t *uid, gid_t *gid,
928 const char **home, const char **shell) {
929 int r;
930 const char *name;
931
932 assert(c);
933
934 if (!c->user)
935 return 0;
936
937 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
938 * (i.e. are "/" or "/bin/nologin"). */
939
940 name = c->user;
941 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
942 if (r < 0)
943 return r;
944
945 *user = name;
946 return 0;
947 }
948
949 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
950 int r;
951 const char *name;
952
953 assert(c);
954
955 if (!c->group)
956 return 0;
957
958 name = c->group;
959 r = get_group_creds(&name, gid, 0);
960 if (r < 0)
961 return r;
962
963 *group = name;
964 return 0;
965 }
966
967 static int get_supplementary_groups(const ExecContext *c, const char *user,
968 const char *group, gid_t gid,
969 gid_t **supplementary_gids, int *ngids) {
970 char **i;
971 int r, k = 0;
972 int ngroups_max;
973 bool keep_groups = false;
974 gid_t *groups = NULL;
975 _cleanup_free_ gid_t *l_gids = NULL;
976
977 assert(c);
978
979 /*
980 * If user is given, then lookup GID and supplementary groups list.
981 * We avoid NSS lookups for gid=0. Also we have to initialize groups
982 * here and as early as possible so we keep the list of supplementary
983 * groups of the caller.
984 */
985 if (user && gid_is_valid(gid) && gid != 0) {
986 /* First step, initialize groups from /etc/groups */
987 if (initgroups(user, gid) < 0)
988 return -errno;
989
990 keep_groups = true;
991 }
992
993 if (strv_isempty(c->supplementary_groups))
994 return 0;
995
996 /*
997 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
998 * be positive, otherwise fail.
999 */
1000 errno = 0;
1001 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1002 if (ngroups_max <= 0)
1003 return errno_or_else(EOPNOTSUPP);
1004
1005 l_gids = new(gid_t, ngroups_max);
1006 if (!l_gids)
1007 return -ENOMEM;
1008
1009 if (keep_groups) {
1010 /*
1011 * Lookup the list of groups that the user belongs to, we
1012 * avoid NSS lookups here too for gid=0.
1013 */
1014 k = ngroups_max;
1015 if (getgrouplist(user, gid, l_gids, &k) < 0)
1016 return -EINVAL;
1017 } else
1018 k = 0;
1019
1020 STRV_FOREACH(i, c->supplementary_groups) {
1021 const char *g;
1022
1023 if (k >= ngroups_max)
1024 return -E2BIG;
1025
1026 g = *i;
1027 r = get_group_creds(&g, l_gids+k, 0);
1028 if (r < 0)
1029 return r;
1030
1031 k++;
1032 }
1033
1034 /*
1035 * Sets ngids to zero to drop all supplementary groups, happens
1036 * when we are under root and SupplementaryGroups= is empty.
1037 */
1038 if (k == 0) {
1039 *ngids = 0;
1040 return 0;
1041 }
1042
1043 /* Otherwise get the final list of supplementary groups */
1044 groups = memdup(l_gids, sizeof(gid_t) * k);
1045 if (!groups)
1046 return -ENOMEM;
1047
1048 *supplementary_gids = groups;
1049 *ngids = k;
1050
1051 groups = NULL;
1052
1053 return 0;
1054 }
1055
/* Applies the group credentials: installs the supplementary group list if 'ngids' is positive, then
 * sets real, effective and saved GID to 'gid' if that is valid.
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r;

                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1074
1075 static int enforce_user(const ExecContext *context, uid_t uid) {
1076 assert(context);
1077
1078 if (!uid_is_valid(uid))
1079 return 0;
1080
1081 /* Sets (but doesn't look up) the uid and make sure we keep the
1082 * capabilities while doing so. */
1083
1084 if (context->capability_ambient_set != 0) {
1085
1086 /* First step: If we need to keep capabilities but
1087 * drop privileges we need to make sure we keep our
1088 * caps, while we drop privileges. */
1089 if (uid != 0) {
1090 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1091
1092 if (prctl(PR_GET_SECUREBITS) != sb)
1093 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1094 return -errno;
1095 }
1096 }
1097
1098 /* Second step: actually set the uids */
1099 if (setresuid(uid, uid, uid) < 0)
1100 return -errno;
1101
1102 /* At this point we should have all necessary capabilities but
1103 are otherwise a normal user. However, the caps might got
1104 corrupted due to the setresuid() so we need clean them up
1105 later. This is done outside of this call. */
1106
1107 return 0;
1108 }
1109
1110 #if HAVE_PAM
1111
1112 static int null_conv(
1113 int num_msg,
1114 const struct pam_message **msg,
1115 struct pam_response **resp,
1116 void *appdata_ptr) {
1117
1118 /* We don't support conversations */
1119
1120 return PAM_CONV_ERR;
1121 }
1122
1123 #endif
1124
1125 static int setup_pam(
1126 const char *name,
1127 const char *user,
1128 uid_t uid,
1129 gid_t gid,
1130 const char *tty,
1131 char ***env,
1132 const int fds[], size_t n_fds) {
1133
1134 #if HAVE_PAM
1135
1136 static const struct pam_conv conv = {
1137 .conv = null_conv,
1138 .appdata_ptr = NULL
1139 };
1140
1141 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1142 pam_handle_t *handle = NULL;
1143 sigset_t old_ss;
1144 int pam_code = PAM_SUCCESS, r;
1145 char **nv, **e = NULL;
1146 bool close_session = false;
1147 pid_t pam_pid = 0, parent_pid;
1148 int flags = 0;
1149
1150 assert(name);
1151 assert(user);
1152 assert(env);
1153
1154 /* We set up PAM in the parent process, then fork. The child
1155 * will then stay around until killed via PR_GET_PDEATHSIG or
1156 * systemd via the cgroup logic. It will then remove the PAM
1157 * session again. The parent process will exec() the actual
1158 * daemon. We do things this way to ensure that the main PID
1159 * of the daemon is the one we initially fork()ed. */
1160
1161 r = barrier_create(&barrier);
1162 if (r < 0)
1163 goto fail;
1164
1165 if (log_get_max_level() < LOG_DEBUG)
1166 flags |= PAM_SILENT;
1167
1168 pam_code = pam_start(name, user, &conv, &handle);
1169 if (pam_code != PAM_SUCCESS) {
1170 handle = NULL;
1171 goto fail;
1172 }
1173
1174 if (!tty) {
1175 _cleanup_free_ char *q = NULL;
1176
1177 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1178 * out if that's the case, and read the TTY off it. */
1179
1180 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1181 tty = strjoina("/dev/", q);
1182 }
1183
1184 if (tty) {
1185 pam_code = pam_set_item(handle, PAM_TTY, tty);
1186 if (pam_code != PAM_SUCCESS)
1187 goto fail;
1188 }
1189
1190 STRV_FOREACH(nv, *env) {
1191 pam_code = pam_putenv(handle, *nv);
1192 if (pam_code != PAM_SUCCESS)
1193 goto fail;
1194 }
1195
1196 pam_code = pam_acct_mgmt(handle, flags);
1197 if (pam_code != PAM_SUCCESS)
1198 goto fail;
1199
1200 pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
1201 if (pam_code != PAM_SUCCESS)
1202 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));
1203
1204 pam_code = pam_open_session(handle, flags);
1205 if (pam_code != PAM_SUCCESS)
1206 goto fail;
1207
1208 close_session = true;
1209
1210 e = pam_getenvlist(handle);
1211 if (!e) {
1212 pam_code = PAM_BUF_ERR;
1213 goto fail;
1214 }
1215
1216 /* Block SIGTERM, so that we know that it won't get lost in
1217 * the child */
1218
1219 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1220
1221 parent_pid = getpid_cached();
1222
1223 r = safe_fork("(sd-pam)", 0, &pam_pid);
1224 if (r < 0)
1225 goto fail;
1226 if (r == 0) {
1227 int sig, ret = EXIT_PAM;
1228
1229 /* The child's job is to reset the PAM session on
1230 * termination */
1231 barrier_set_role(&barrier, BARRIER_CHILD);
1232
1233 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
1234 * are open here that have been opened by PAM. */
1235 (void) close_many(fds, n_fds);
1236
1237 /* Drop privileges - we don't need any to pam_close_session
1238 * and this will make PR_SET_PDEATHSIG work in most cases.
1239 * If this fails, ignore the error - but expect sd-pam threads
1240 * to fail to exit normally */
1241
1242 r = maybe_setgroups(0, NULL);
1243 if (r < 0)
1244 log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
1245 if (setresgid(gid, gid, gid) < 0)
1246 log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
1247 if (setresuid(uid, uid, uid) < 0)
1248 log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
1249
1250 (void) ignore_signals(SIGPIPE, -1);
1251
1252 /* Wait until our parent died. This will only work if
1253 * the above setresuid() succeeds, otherwise the kernel
1254 * will not allow unprivileged parents kill their privileged
1255 * children this way. We rely on the control groups kill logic
1256 * to do the rest for us. */
1257 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1258 goto child_finish;
1259
1260 /* Tell the parent that our setup is done. This is especially
1261 * important regarding dropping privileges. Otherwise, unit
1262 * setup might race against our setresuid(2) call.
1263 *
1264 * If the parent aborted, we'll detect this below, hence ignore
1265 * return failure here. */
1266 (void) barrier_place(&barrier);
1267
1268 /* Check if our parent process might already have died? */
1269 if (getppid() == parent_pid) {
1270 sigset_t ss;
1271
1272 assert_se(sigemptyset(&ss) >= 0);
1273 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1274
1275 for (;;) {
1276 if (sigwait(&ss, &sig) < 0) {
1277 if (errno == EINTR)
1278 continue;
1279
1280 goto child_finish;
1281 }
1282
1283 assert(sig == SIGTERM);
1284 break;
1285 }
1286 }
1287
1288 pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
1289 if (pam_code != PAM_SUCCESS)
1290 goto child_finish;
1291
1292 /* If our parent died we'll end the session */
1293 if (getppid() != parent_pid) {
1294 pam_code = pam_close_session(handle, flags);
1295 if (pam_code != PAM_SUCCESS)
1296 goto child_finish;
1297 }
1298
1299 ret = 0;
1300
1301 child_finish:
1302 pam_end(handle, pam_code | flags);
1303 _exit(ret);
1304 }
1305
1306 barrier_set_role(&barrier, BARRIER_PARENT);
1307
1308 /* If the child was forked off successfully it will do all the
1309 * cleanups, so forget about the handle here. */
1310 handle = NULL;
1311
1312 /* Unblock SIGTERM again in the parent */
1313 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1314
1315 /* We close the log explicitly here, since the PAM modules
1316 * might have opened it, but we don't want this fd around. */
1317 closelog();
1318
1319 /* Synchronously wait for the child to initialize. We don't care for
1320 * errors as we cannot recover. However, warn loudly if it happens. */
1321 if (!barrier_place_and_sync(&barrier))
1322 log_error("PAM initialization failed");
1323
1324 return strv_free_and_replace(*env, e);
1325
1326 fail:
1327 if (pam_code != PAM_SUCCESS) {
1328 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1329 r = -EPERM; /* PAM errors do not map to errno */
1330 } else
1331 log_error_errno(r, "PAM failed: %m");
1332
1333 if (handle) {
1334 if (close_session)
1335 pam_code = pam_close_session(handle, flags);
1336
1337 pam_end(handle, pam_code | flags);
1338 }
1339
1340 strv_free(e);
1341 closelog();
1342
1343 return r;
1344 #else
1345 return 0;
1346 #endif
1347 }
1348
/* Renames the calling process to "(<name>)", where <name> is taken from the last component of the given
 * path. At most the trailing 8 characters of that component are used, so that the result, including the
 * parentheses, never exceeds 10 characters (the length of "/sbin/init") and looks tidy in /bin/ps. If the
 * path has no usable last component the name "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char buf[11]; /* '(' + up to 8 characters + ')' + NUL */
        const char *name;
        char *w = buf;
        size_t len;

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        len = strlen(name);
        if (len > 8) {
                /* Keep the tail rather than the head: the beginning is frequently just a common
                 * prefix such as "systemd-", the end is what tells binaries apart. */
                name += len - 8;
                len = 8;
        }

        *w++ = '(';
        memcpy(w, name, len);
        w += len;
        *w++ = ')';
        *w = 0;

        rename_process(buf);
}
1379
1380 static bool context_has_address_families(const ExecContext *c) {
1381 assert(c);
1382
1383 return c->address_families_allow_list ||
1384 !set_isempty(c->address_families);
1385 }
1386
1387 static bool context_has_syscall_filters(const ExecContext *c) {
1388 assert(c);
1389
1390 return c->syscall_allow_list ||
1391 !hashmap_isempty(c->syscall_filter);
1392 }
1393
1394 static bool context_has_no_new_privileges(const ExecContext *c) {
1395 assert(c);
1396
1397 if (c->no_new_privileges)
1398 return true;
1399
1400 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1401 return false;
1402
1403 /* We need NNP if we have any form of seccomp and are unprivileged */
1404 return context_has_address_families(c) ||
1405 c->memory_deny_write_execute ||
1406 c->restrict_realtime ||
1407 c->restrict_suid_sgid ||
1408 exec_context_restrict_namespaces_set(c) ||
1409 c->protect_clock ||
1410 c->protect_kernel_tunables ||
1411 c->protect_kernel_modules ||
1412 c->protect_kernel_logs ||
1413 c->private_devices ||
1414 context_has_syscall_filters(c) ||
1415 !set_isempty(c->syscall_archs) ||
1416 c->lock_personality ||
1417 c->protect_hostname;
1418 }
1419
1420 #if HAVE_SECCOMP
1421
1422 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1423
1424 if (is_seccomp_available())
1425 return false;
1426
1427 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1428 return true;
1429 }
1430
1431 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1432 uint32_t negative_action, default_action, action;
1433 int r;
1434
1435 assert(u);
1436 assert(c);
1437
1438 if (!context_has_syscall_filters(c))
1439 return 0;
1440
1441 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1442 return 0;
1443
1444 negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1445
1446 if (c->syscall_allow_list) {
1447 default_action = negative_action;
1448 action = SCMP_ACT_ALLOW;
1449 } else {
1450 default_action = SCMP_ACT_ALLOW;
1451 action = negative_action;
1452 }
1453
1454 if (needs_ambient_hack) {
1455 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1456 if (r < 0)
1457 return r;
1458 }
1459
1460 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1461 }
1462
1463 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1464 assert(u);
1465 assert(c);
1466
1467 if (set_isempty(c->syscall_archs))
1468 return 0;
1469
1470 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1471 return 0;
1472
1473 return seccomp_restrict_archs(c->syscall_archs);
1474 }
1475
1476 static int apply_address_families(const Unit* u, const ExecContext *c) {
1477 assert(u);
1478 assert(c);
1479
1480 if (!context_has_address_families(c))
1481 return 0;
1482
1483 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1484 return 0;
1485
1486 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1487 }
1488
1489 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1490 assert(u);
1491 assert(c);
1492
1493 if (!c->memory_deny_write_execute)
1494 return 0;
1495
1496 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1497 return 0;
1498
1499 return seccomp_memory_deny_write_execute();
1500 }
1501
1502 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1503 assert(u);
1504 assert(c);
1505
1506 if (!c->restrict_realtime)
1507 return 0;
1508
1509 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1510 return 0;
1511
1512 return seccomp_restrict_realtime();
1513 }
1514
1515 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1516 assert(u);
1517 assert(c);
1518
1519 if (!c->restrict_suid_sgid)
1520 return 0;
1521
1522 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1523 return 0;
1524
1525 return seccomp_restrict_suid_sgid();
1526 }
1527
1528 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1529 assert(u);
1530 assert(c);
1531
1532 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1533 * let's protect even those systems where this is left on in the kernel. */
1534
1535 if (!c->protect_kernel_tunables)
1536 return 0;
1537
1538 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1539 return 0;
1540
1541 return seccomp_protect_sysctl();
1542 }
1543
1544 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1545 assert(u);
1546 assert(c);
1547
1548 /* Turn off module syscalls on ProtectKernelModules=yes */
1549
1550 if (!c->protect_kernel_modules)
1551 return 0;
1552
1553 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1554 return 0;
1555
1556 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1557 }
1558
1559 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1560 assert(u);
1561 assert(c);
1562
1563 if (!c->protect_kernel_logs)
1564 return 0;
1565
1566 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1567 return 0;
1568
1569 return seccomp_protect_syslog();
1570 }
1571
1572 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1573 assert(u);
1574 assert(c);
1575
1576 if (!c->protect_clock)
1577 return 0;
1578
1579 if (skip_seccomp_unavailable(u, "ProtectClock="))
1580 return 0;
1581
1582 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1583 }
1584
1585 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1586 assert(u);
1587 assert(c);
1588
1589 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1590
1591 if (!c->private_devices)
1592 return 0;
1593
1594 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1595 return 0;
1596
1597 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1598 }
1599
1600 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1601 assert(u);
1602 assert(c);
1603
1604 if (!exec_context_restrict_namespaces_set(c))
1605 return 0;
1606
1607 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1608 return 0;
1609
1610 return seccomp_restrict_namespaces(c->restrict_namespaces);
1611 }
1612
1613 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1614 unsigned long personality;
1615 int r;
1616
1617 assert(u);
1618 assert(c);
1619
1620 if (!c->lock_personality)
1621 return 0;
1622
1623 if (skip_seccomp_unavailable(u, "LockPersonality="))
1624 return 0;
1625
1626 personality = c->personality;
1627
1628 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1629 if (personality == PERSONALITY_INVALID) {
1630
1631 r = opinionated_personality(&personality);
1632 if (r < 0)
1633 return r;
1634 }
1635
1636 return seccomp_lock_personality(personality);
1637 }
1638
1639 #endif
1640
1641 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1642 assert(u);
1643 assert(c);
1644
1645 if (!c->protect_hostname)
1646 return 0;
1647
1648 if (ns_type_supported(NAMESPACE_UTS)) {
1649 if (unshare(CLONE_NEWUTS) < 0) {
1650 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1651 *ret_exit_status = EXIT_NAMESPACE;
1652 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1653 }
1654
1655 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1656 }
1657 } else
1658 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1659
1660 #if HAVE_SECCOMP
1661 int r;
1662
1663 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1664 return 0;
1665
1666 r = seccomp_protect_hostname();
1667 if (r < 0) {
1668 *ret_exit_status = EXIT_SECCOMP;
1669 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1670 }
1671 #endif
1672
1673 return 0;
1674 }
1675
1676 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1677 assert(idle_pipe);
1678
1679 idle_pipe[1] = safe_close(idle_pipe[1]);
1680 idle_pipe[2] = safe_close(idle_pipe[2]);
1681
1682 if (idle_pipe[0] >= 0) {
1683 int r;
1684
1685 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1686
1687 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1688 ssize_t n;
1689
1690 /* Signal systemd that we are bored and want to continue. */
1691 n = write(idle_pipe[3], "x", 1);
1692 if (n > 0)
1693 /* Wait for systemd to react to the signal above. */
1694 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1695 }
1696
1697 idle_pipe[0] = safe_close(idle_pipe[0]);
1698
1699 }
1700
1701 idle_pipe[3] = safe_close(idle_pipe[3]);
1702 }
1703
1704 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1705
1706 static int build_environment(
1707 const Unit *u,
1708 const ExecContext *c,
1709 const ExecParameters *p,
1710 size_t n_fds,
1711 const char *home,
1712 const char *username,
1713 const char *shell,
1714 dev_t journal_stream_dev,
1715 ino_t journal_stream_ino,
1716 char ***ret) {
1717
1718 _cleanup_strv_free_ char **our_env = NULL;
1719 ExecDirectoryType t;
1720 size_t n_env = 0;
1721 char *x;
1722
1723 assert(u);
1724 assert(c);
1725 assert(p);
1726 assert(ret);
1727
1728 #define N_ENV_VARS 15
1729 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1730 if (!our_env)
1731 return -ENOMEM;
1732
1733 if (n_fds > 0) {
1734 _cleanup_free_ char *joined = NULL;
1735
1736 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1737 return -ENOMEM;
1738 our_env[n_env++] = x;
1739
1740 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1741 return -ENOMEM;
1742 our_env[n_env++] = x;
1743
1744 joined = strv_join(p->fd_names, ":");
1745 if (!joined)
1746 return -ENOMEM;
1747
1748 x = strjoin("LISTEN_FDNAMES=", joined);
1749 if (!x)
1750 return -ENOMEM;
1751 our_env[n_env++] = x;
1752 }
1753
1754 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1755 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1756 return -ENOMEM;
1757 our_env[n_env++] = x;
1758
1759 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1760 return -ENOMEM;
1761 our_env[n_env++] = x;
1762 }
1763
1764 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1765 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1766 * check the database directly. */
1767 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1768 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1769 if (!x)
1770 return -ENOMEM;
1771 our_env[n_env++] = x;
1772 }
1773
1774 if (home) {
1775 x = strjoin("HOME=", home);
1776 if (!x)
1777 return -ENOMEM;
1778
1779 path_simplify(x + 5, true);
1780 our_env[n_env++] = x;
1781 }
1782
1783 if (username) {
1784 x = strjoin("LOGNAME=", username);
1785 if (!x)
1786 return -ENOMEM;
1787 our_env[n_env++] = x;
1788
1789 x = strjoin("USER=", username);
1790 if (!x)
1791 return -ENOMEM;
1792 our_env[n_env++] = x;
1793 }
1794
1795 if (shell) {
1796 x = strjoin("SHELL=", shell);
1797 if (!x)
1798 return -ENOMEM;
1799
1800 path_simplify(x + 6, true);
1801 our_env[n_env++] = x;
1802 }
1803
1804 if (!sd_id128_is_null(u->invocation_id)) {
1805 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1806 return -ENOMEM;
1807
1808 our_env[n_env++] = x;
1809 }
1810
1811 if (exec_context_needs_term(c)) {
1812 const char *tty_path, *term = NULL;
1813
1814 tty_path = exec_context_tty_path(c);
1815
1816 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1817 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1818 * container manager passes to PID 1 ends up all the way in the console login shown. */
1819
1820 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1821 term = getenv("TERM");
1822
1823 if (!term)
1824 term = default_term_for_tty(tty_path);
1825
1826 x = strjoin("TERM=", term);
1827 if (!x)
1828 return -ENOMEM;
1829 our_env[n_env++] = x;
1830 }
1831
1832 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1833 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1834 return -ENOMEM;
1835
1836 our_env[n_env++] = x;
1837 }
1838
1839 if (c->log_namespace) {
1840 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1841 if (!x)
1842 return -ENOMEM;
1843
1844 our_env[n_env++] = x;
1845 }
1846
1847 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1848 _cleanup_free_ char *pre = NULL, *joined = NULL;
1849 const char *n;
1850
1851 if (!p->prefix[t])
1852 continue;
1853
1854 if (strv_isempty(c->directories[t].paths))
1855 continue;
1856
1857 n = exec_directory_env_name_to_string(t);
1858 if (!n)
1859 continue;
1860
1861 pre = strjoin(p->prefix[t], "/");
1862 if (!pre)
1863 return -ENOMEM;
1864
1865 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1866 if (!joined)
1867 return -ENOMEM;
1868
1869 x = strjoin(n, "=", joined);
1870 if (!x)
1871 return -ENOMEM;
1872
1873 our_env[n_env++] = x;
1874 }
1875
1876 our_env[n_env++] = NULL;
1877 assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1878 #undef N_ENV_VARS
1879
1880 *ret = TAKE_PTR(our_env);
1881
1882 return 0;
1883 }
1884
1885 static int build_pass_environment(const ExecContext *c, char ***ret) {
1886 _cleanup_strv_free_ char **pass_env = NULL;
1887 size_t n_env = 0, n_bufsize = 0;
1888 char **i;
1889
1890 STRV_FOREACH(i, c->pass_environment) {
1891 _cleanup_free_ char *x = NULL;
1892 char *v;
1893
1894 v = getenv(*i);
1895 if (!v)
1896 continue;
1897 x = strjoin(*i, "=", v);
1898 if (!x)
1899 return -ENOMEM;
1900
1901 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1902 return -ENOMEM;
1903
1904 pass_env[n_env++] = TAKE_PTR(x);
1905 pass_env[n_env] = NULL;
1906 }
1907
1908 *ret = TAKE_PTR(pass_env);
1909
1910 return 0;
1911 }
1912
1913 static bool exec_needs_mount_namespace(
1914 const ExecContext *context,
1915 const ExecParameters *params,
1916 const ExecRuntime *runtime) {
1917
1918 assert(context);
1919 assert(params);
1920
1921 if (context->root_image)
1922 return true;
1923
1924 if (!strv_isempty(context->read_write_paths) ||
1925 !strv_isempty(context->read_only_paths) ||
1926 !strv_isempty(context->inaccessible_paths))
1927 return true;
1928
1929 if (context->n_bind_mounts > 0)
1930 return true;
1931
1932 if (context->n_temporary_filesystems > 0)
1933 return true;
1934
1935 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
1936 return true;
1937
1938 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1939 return true;
1940
1941 if (context->private_devices ||
1942 context->private_mounts ||
1943 context->protect_system != PROTECT_SYSTEM_NO ||
1944 context->protect_home != PROTECT_HOME_NO ||
1945 context->protect_kernel_tunables ||
1946 context->protect_kernel_modules ||
1947 context->protect_kernel_logs ||
1948 context->protect_control_groups)
1949 return true;
1950
1951 if (context->root_directory) {
1952 ExecDirectoryType t;
1953
1954 if (context->mount_apivfs)
1955 return true;
1956
1957 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1958 if (!params->prefix[t])
1959 continue;
1960
1961 if (!strv_isempty(context->directories[t].paths))
1962 return true;
1963 }
1964 }
1965
1966 if (context->dynamic_user &&
1967 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1968 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1969 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1970 return true;
1971
1972 if (context->log_namespace)
1973 return true;
1974
1975 return false;
1976 }
1977
1978 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
1979 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1980 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1981 _cleanup_close_ int unshare_ready_fd = -1;
1982 _cleanup_(sigkill_waitp) pid_t pid = 0;
1983 uint64_t c = 1;
1984 ssize_t n;
1985 int r;
1986
1987 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
1988 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
1989 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1990 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1991 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1992 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1993 * continues execution normally.
1994 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
1995 * does not need CAP_SETUID to write the single line mapping to itself. */
1996
1997 /* Can only set up multiple mappings with CAP_SETUID. */
1998 if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
1999 r = asprintf(&uid_map,
2000 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2001 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2002 ouid, ouid, uid, uid);
2003 else
2004 r = asprintf(&uid_map,
2005 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2006 ouid, ouid);
2007
2008 if (r < 0)
2009 return -ENOMEM;
2010
2011 /* Can only set up multiple mappings with CAP_SETGID. */
2012 if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
2013 r = asprintf(&gid_map,
2014 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2015 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2016 ogid, ogid, gid, gid);
2017 else
2018 r = asprintf(&gid_map,
2019 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2020 ogid, ogid);
2021
2022 if (r < 0)
2023 return -ENOMEM;
2024
2025 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2026 * namespace. */
2027 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2028 if (unshare_ready_fd < 0)
2029 return -errno;
2030
2031 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2032 * failed. */
2033 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2034 return -errno;
2035
2036 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2037 if (r < 0)
2038 return r;
2039 if (r == 0) {
2040 _cleanup_close_ int fd = -1;
2041 const char *a;
2042 pid_t ppid;
2043
2044 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2045 * here, after the parent opened its own user namespace. */
2046
2047 ppid = getppid();
2048 errno_pipe[0] = safe_close(errno_pipe[0]);
2049
2050 /* Wait until the parent unshared the user namespace */
2051 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2052 r = -errno;
2053 goto child_fail;
2054 }
2055
2056 /* Disable the setgroups() system call in the child user namespace, for good. */
2057 a = procfs_file_alloca(ppid, "setgroups");
2058 fd = open(a, O_WRONLY|O_CLOEXEC);
2059 if (fd < 0) {
2060 if (errno != ENOENT) {
2061 r = -errno;
2062 goto child_fail;
2063 }
2064
2065 /* If the file is missing the kernel is too old, let's continue anyway. */
2066 } else {
2067 if (write(fd, "deny\n", 5) < 0) {
2068 r = -errno;
2069 goto child_fail;
2070 }
2071
2072 fd = safe_close(fd);
2073 }
2074
2075 /* First write the GID map */
2076 a = procfs_file_alloca(ppid, "gid_map");
2077 fd = open(a, O_WRONLY|O_CLOEXEC);
2078 if (fd < 0) {
2079 r = -errno;
2080 goto child_fail;
2081 }
2082 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2083 r = -errno;
2084 goto child_fail;
2085 }
2086 fd = safe_close(fd);
2087
2088 /* The write the UID map */
2089 a = procfs_file_alloca(ppid, "uid_map");
2090 fd = open(a, O_WRONLY|O_CLOEXEC);
2091 if (fd < 0) {
2092 r = -errno;
2093 goto child_fail;
2094 }
2095 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2096 r = -errno;
2097 goto child_fail;
2098 }
2099
2100 _exit(EXIT_SUCCESS);
2101
2102 child_fail:
2103 (void) write(errno_pipe[1], &r, sizeof(r));
2104 _exit(EXIT_FAILURE);
2105 }
2106
2107 errno_pipe[1] = safe_close(errno_pipe[1]);
2108
2109 if (unshare(CLONE_NEWUSER) < 0)
2110 return -errno;
2111
2112 /* Let the child know that the namespace is ready now */
2113 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2114 return -errno;
2115
2116 /* Try to read an error code from the child */
2117 n = read(errno_pipe[0], &r, sizeof(r));
2118 if (n < 0)
2119 return -errno;
2120 if (n == sizeof(r)) { /* an error code was sent to us */
2121 if (r < 0)
2122 return r;
2123 return -EIO;
2124 }
2125 if (n != 0) /* on success we should have read 0 bytes */
2126 return -EIO;
2127
2128 r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2129 pid = 0;
2130 if (r < 0)
2131 return r;
2132 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2133 return -EIO;
2134
2135 return 0;
2136 }
2137
2138 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2139 if (!context->dynamic_user)
2140 return false;
2141
2142 if (type == EXEC_DIRECTORY_CONFIGURATION)
2143 return false;
2144
2145 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2146 return false;
2147
2148 return true;
2149 }
2150
2151 static int setup_exec_directory(
2152 const ExecContext *context,
2153 const ExecParameters *params,
2154 uid_t uid,
2155 gid_t gid,
2156 ExecDirectoryType type,
2157 int *exit_status) {
2158
2159 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2160 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2161 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2162 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2163 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2164 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2165 };
2166 char **rt;
2167 int r;
2168
2169 assert(context);
2170 assert(params);
2171 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2172 assert(exit_status);
2173
2174 if (!params->prefix[type])
2175 return 0;
2176
2177 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2178 if (!uid_is_valid(uid))
2179 uid = 0;
2180 if (!gid_is_valid(gid))
2181 gid = 0;
2182 }
2183
2184 STRV_FOREACH(rt, context->directories[type].paths) {
2185 _cleanup_free_ char *p = NULL, *pp = NULL;
2186
2187 p = path_join(params->prefix[type], *rt);
2188 if (!p) {
2189 r = -ENOMEM;
2190 goto fail;
2191 }
2192
2193 r = mkdir_parents_label(p, 0755);
2194 if (r < 0)
2195 goto fail;
2196
2197 if (exec_directory_is_private(context, type)) {
2198 _cleanup_free_ char *private_root = NULL;
2199
2200 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2201 * case we want to avoid leaving a directory around fully accessible that is owned by
2202 * a dynamic user whose UID is later on reused. To lock this down we use the same
2203 * trick used by container managers to prohibit host users to get access to files of
2204 * the same UID in containers: we place everything inside a directory that has an
2205 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2206 * for unprivileged host code. We then use fs namespacing to make this directory
2207 * permeable for the service itself.
2208 *
2209 * Specifically: for a service which wants a special directory "foo/" we first create
2210 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2211 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2212 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2213 * unprivileged host users can't look into it. Inside of the namespace of the unit
2214 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2215 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2216 * for the service and making sure it only gets access to the dirs it needs but no
2217 * others. Tricky? Yes, absolutely, but it works!
2218 *
2219 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2220 * to be owned by the service itself.
2221 *
2222 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2223 * for sharing files or sockets with other services. */
2224
2225 private_root = path_join(params->prefix[type], "private");
2226 if (!private_root) {
2227 r = -ENOMEM;
2228 goto fail;
2229 }
2230
2231 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2232 r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2233 if (r < 0)
2234 goto fail;
2235
2236 pp = path_join(private_root, *rt);
2237 if (!pp) {
2238 r = -ENOMEM;
2239 goto fail;
2240 }
2241
2242 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2243 r = mkdir_parents_label(pp, 0755);
2244 if (r < 0)
2245 goto fail;
2246
2247 if (is_dir(p, false) > 0 &&
2248 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2249
2250 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2251 * it over. Most likely the service has been upgraded from one that didn't use
2252 * DynamicUser=1, to one that does. */
2253
2254 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2255 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2256 exec_directory_type_to_string(type), p, pp);
2257
2258 if (rename(p, pp) < 0) {
2259 r = -errno;
2260 goto fail;
2261 }
2262 } else {
2263 /* Otherwise, create the actual directory for the service */
2264
2265 r = mkdir_label(pp, context->directories[type].mode);
2266 if (r < 0 && r != -EEXIST)
2267 goto fail;
2268 }
2269
2270 /* And link it up from the original place */
2271 r = symlink_idempotent(pp, p, true);
2272 if (r < 0)
2273 goto fail;
2274
2275 } else {
2276 _cleanup_free_ char *target = NULL;
2277
2278 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2279 readlink_and_make_absolute(p, &target) >= 0) {
2280 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2281
2282 /* This already exists and is a symlink? Interesting. Maybe it's one created
2283 * by DynamicUser=1 (see above)?
2284 *
2285 * We do this for all directory types except for ConfigurationDirectory=,
2286 * since they all support the private/ symlink logic at least in some
2287 * configurations, see above. */
2288
2289 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2290 if (r < 0)
2291 goto fail;
2292
2293 q = path_join(params->prefix[type], "private", *rt);
2294 if (!q) {
2295 r = -ENOMEM;
2296 goto fail;
2297 }
2298
2299 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2300 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2301 if (r < 0)
2302 goto fail;
2303
2304 if (path_equal(q_resolved, target_resolved)) {
2305
2306 /* Hmm, apparently DynamicUser= was once turned on for this service,
2307 * but is no longer. Let's move the directory back up. */
2308
2309 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2310 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2311 exec_directory_type_to_string(type), q, p);
2312
2313 if (unlink(p) < 0) {
2314 r = -errno;
2315 goto fail;
2316 }
2317
2318 if (rename(q, p) < 0) {
2319 r = -errno;
2320 goto fail;
2321 }
2322 }
2323 }
2324
2325 r = mkdir_label(p, context->directories[type].mode);
2326 if (r < 0) {
2327 if (r != -EEXIST)
2328 goto fail;
2329
2330 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2331 struct stat st;
2332
2333 /* Don't change the owner/access mode of the configuration directory,
2334 * as in the common case it is not written to by a service, and shall
2335 * not be writable. */
2336
2337 if (stat(p, &st) < 0) {
2338 r = -errno;
2339 goto fail;
2340 }
2341
2342 /* Still complain if the access mode doesn't match */
2343 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2344 log_warning("%s \'%s\' already exists but the mode is different. "
2345 "(File system: %o %sMode: %o)",
2346 exec_directory_type_to_string(type), *rt,
2347 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2348
2349 continue;
2350 }
2351 }
2352 }
2353
2354 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2355 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2356 * current UID/GID ownership.) */
2357 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2358 if (r < 0)
2359 goto fail;
2360
2361 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2362 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2363 * assignments to exist.*/
2364 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2365 if (r < 0)
2366 goto fail;
2367 }
2368
2369 return 0;
2370
2371 fail:
2372 *exit_status = exit_status_table[type];
2373 return r;
2374 }
2375
2376 #if ENABLE_SMACK
2377 static int setup_smack(
2378 const ExecContext *context,
2379 const ExecCommand *command) {
2380
2381 int r;
2382
2383 assert(context);
2384 assert(command);
2385
2386 if (context->smack_process_label) {
2387 r = mac_smack_apply_pid(0, context->smack_process_label);
2388 if (r < 0)
2389 return r;
2390 }
2391 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2392 else {
2393 _cleanup_free_ char *exec_label = NULL;
2394
2395 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2396 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2397 return r;
2398
2399 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2400 if (r < 0)
2401 return r;
2402 }
2403 #endif
2404
2405 return 0;
2406 }
2407 #endif
2408
2409 static int compile_bind_mounts(
2410 const ExecContext *context,
2411 const ExecParameters *params,
2412 BindMount **ret_bind_mounts,
2413 size_t *ret_n_bind_mounts,
2414 char ***ret_empty_directories) {
2415
2416 _cleanup_strv_free_ char **empty_directories = NULL;
2417 BindMount *bind_mounts;
2418 size_t n, h = 0, i;
2419 ExecDirectoryType t;
2420 int r;
2421
2422 assert(context);
2423 assert(params);
2424 assert(ret_bind_mounts);
2425 assert(ret_n_bind_mounts);
2426 assert(ret_empty_directories);
2427
2428 n = context->n_bind_mounts;
2429 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2430 if (!params->prefix[t])
2431 continue;
2432
2433 n += strv_length(context->directories[t].paths);
2434 }
2435
2436 if (n <= 0) {
2437 *ret_bind_mounts = NULL;
2438 *ret_n_bind_mounts = 0;
2439 *ret_empty_directories = NULL;
2440 return 0;
2441 }
2442
2443 bind_mounts = new(BindMount, n);
2444 if (!bind_mounts)
2445 return -ENOMEM;
2446
2447 for (i = 0; i < context->n_bind_mounts; i++) {
2448 BindMount *item = context->bind_mounts + i;
2449 char *s, *d;
2450
2451 s = strdup(item->source);
2452 if (!s) {
2453 r = -ENOMEM;
2454 goto finish;
2455 }
2456
2457 d = strdup(item->destination);
2458 if (!d) {
2459 free(s);
2460 r = -ENOMEM;
2461 goto finish;
2462 }
2463
2464 bind_mounts[h++] = (BindMount) {
2465 .source = s,
2466 .destination = d,
2467 .read_only = item->read_only,
2468 .recursive = item->recursive,
2469 .ignore_enoent = item->ignore_enoent,
2470 };
2471 }
2472
2473 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2474 char **suffix;
2475
2476 if (!params->prefix[t])
2477 continue;
2478
2479 if (strv_isempty(context->directories[t].paths))
2480 continue;
2481
2482 if (exec_directory_is_private(context, t) &&
2483 !(context->root_directory || context->root_image)) {
2484 char *private_root;
2485
2486 /* So this is for a dynamic user, and we need to make sure the process can access its own
2487 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2488 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2489
2490 private_root = path_join(params->prefix[t], "private");
2491 if (!private_root) {
2492 r = -ENOMEM;
2493 goto finish;
2494 }
2495
2496 r = strv_consume(&empty_directories, private_root);
2497 if (r < 0)
2498 goto finish;
2499 }
2500
2501 STRV_FOREACH(suffix, context->directories[t].paths) {
2502 char *s, *d;
2503
2504 if (exec_directory_is_private(context, t))
2505 s = path_join(params->prefix[t], "private", *suffix);
2506 else
2507 s = path_join(params->prefix[t], *suffix);
2508 if (!s) {
2509 r = -ENOMEM;
2510 goto finish;
2511 }
2512
2513 if (exec_directory_is_private(context, t) &&
2514 (context->root_directory || context->root_image))
2515 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2516 * directory is not created on the root directory. So, let's bind-mount the directory
2517 * on the 'non-private' place. */
2518 d = path_join(params->prefix[t], *suffix);
2519 else
2520 d = strdup(s);
2521 if (!d) {
2522 free(s);
2523 r = -ENOMEM;
2524 goto finish;
2525 }
2526
2527 bind_mounts[h++] = (BindMount) {
2528 .source = s,
2529 .destination = d,
2530 .read_only = false,
2531 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2532 .recursive = true,
2533 .ignore_enoent = false,
2534 };
2535 }
2536 }
2537
2538 assert(h == n);
2539
2540 *ret_bind_mounts = bind_mounts;
2541 *ret_n_bind_mounts = n;
2542 *ret_empty_directories = TAKE_PTR(empty_directories);
2543
2544 return (int) n;
2545
2546 finish:
2547 bind_mount_free_many(bind_mounts, h);
2548 return r;
2549 }
2550
2551 static bool insist_on_sandboxing(
2552 const ExecContext *context,
2553 const char *root_dir,
2554 const char *root_image,
2555 const BindMount *bind_mounts,
2556 size_t n_bind_mounts) {
2557
2558 size_t i;
2559
2560 assert(context);
2561 assert(n_bind_mounts == 0 || bind_mounts);
2562
2563 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2564 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2565 * rearrange stuff in a way we cannot ignore gracefully. */
2566
2567 if (context->n_temporary_filesystems > 0)
2568 return true;
2569
2570 if (root_dir || root_image)
2571 return true;
2572
2573 if (context->dynamic_user)
2574 return true;
2575
2576 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2577 * essential. */
2578 for (i = 0; i < n_bind_mounts; i++)
2579 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2580 return true;
2581
2582 if (context->log_namespace)
2583 return true;
2584
2585 return false;
2586 }
2587
2588 static int apply_mount_namespace(
2589 const Unit *u,
2590 const ExecCommand *command,
2591 const ExecContext *context,
2592 const ExecParameters *params,
2593 const ExecRuntime *runtime,
2594 char **error_path) {
2595
2596 _cleanup_strv_free_ char **empty_directories = NULL;
2597 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
2598 const char *root_dir = NULL, *root_image = NULL;
2599 NamespaceInfo ns_info;
2600 bool needs_sandboxing;
2601 BindMount *bind_mounts = NULL;
2602 size_t n_bind_mounts = 0;
2603 int r;
2604
2605 assert(context);
2606
2607 if (params->flags & EXEC_APPLY_CHROOT) {
2608 root_image = context->root_image;
2609
2610 if (!root_image)
2611 root_dir = context->root_directory;
2612 }
2613
2614 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2615 if (r < 0)
2616 return r;
2617
2618 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2619 if (needs_sandboxing) {
2620 /* The runtime struct only contains the parent of the private /tmp,
2621 * which is non-accessible to world users. Inside of it there's a /tmp
2622 * that is sticky, and that's the one we want to use here.
2623 * This does not apply when we are using /run/systemd/empty as fallback. */
2624
2625 if (context->private_tmp && runtime) {
2626 if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
2627 tmp_dir = runtime->tmp_dir;
2628 else if (runtime->tmp_dir)
2629 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
2630
2631 if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
2632 var_tmp_dir = runtime->var_tmp_dir;
2633 else if (runtime->tmp_dir)
2634 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
2635 }
2636
2637 ns_info = (NamespaceInfo) {
2638 .ignore_protect_paths = false,
2639 .private_dev = context->private_devices,
2640 .protect_control_groups = context->protect_control_groups,
2641 .protect_kernel_tunables = context->protect_kernel_tunables,
2642 .protect_kernel_modules = context->protect_kernel_modules,
2643 .protect_kernel_logs = context->protect_kernel_logs,
2644 .protect_hostname = context->protect_hostname,
2645 .mount_apivfs = context->mount_apivfs,
2646 .private_mounts = context->private_mounts,
2647 };
2648 } else if (!context->dynamic_user && root_dir)
2649 /*
2650 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2651 * sandbox info, otherwise enforce it, don't ignore protected paths and
2652 * fail if we are enable to apply the sandbox inside the mount namespace.
2653 */
2654 ns_info = (NamespaceInfo) {
2655 .ignore_protect_paths = true,
2656 };
2657 else
2658 ns_info = (NamespaceInfo) {};
2659
2660 if (context->mount_flags == MS_SHARED)
2661 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2662
2663 r = setup_namespace(root_dir, root_image,
2664 &ns_info, context->read_write_paths,
2665 needs_sandboxing ? context->read_only_paths : NULL,
2666 needs_sandboxing ? context->inaccessible_paths : NULL,
2667 empty_directories,
2668 bind_mounts,
2669 n_bind_mounts,
2670 context->temporary_filesystems,
2671 context->n_temporary_filesystems,
2672 tmp_dir,
2673 var_tmp_dir,
2674 context->log_namespace,
2675 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2676 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2677 context->mount_flags,
2678 context->root_hash, context->root_hash_size, context->root_hash_path,
2679 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
2680 context->root_verity,
2681 DISSECT_IMAGE_DISCARD_ON_LOOP|DISSECT_IMAGE_RELAX_VAR_CHECK|DISSECT_IMAGE_FSCK,
2682 error_path);
2683
2684 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2685 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2686 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2687 * completely different execution environment. */
2688 if (r == -ENOANO) {
2689 if (insist_on_sandboxing(
2690 context,
2691 root_dir, root_image,
2692 bind_mounts,
2693 n_bind_mounts)) {
2694 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2695 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2696 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2697
2698 r = -EOPNOTSUPP;
2699 } else {
2700 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2701 r = 0;
2702 }
2703 }
2704
2705 bind_mount_free_many(bind_mounts, n_bind_mounts);
2706 return r;
2707 }
2708
2709 static int apply_working_directory(
2710 const ExecContext *context,
2711 const ExecParameters *params,
2712 const char *home,
2713 int *exit_status) {
2714
2715 const char *d, *wd;
2716
2717 assert(context);
2718 assert(exit_status);
2719
2720 if (context->working_directory_home) {
2721
2722 if (!home) {
2723 *exit_status = EXIT_CHDIR;
2724 return -ENXIO;
2725 }
2726
2727 wd = home;
2728
2729 } else if (context->working_directory)
2730 wd = context->working_directory;
2731 else
2732 wd = "/";
2733
2734 if (params->flags & EXEC_APPLY_CHROOT)
2735 d = wd;
2736 else
2737 d = prefix_roota(context->root_directory, wd);
2738
2739 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2740 *exit_status = EXIT_CHDIR;
2741 return -errno;
2742 }
2743
2744 return 0;
2745 }
2746
2747 static int apply_root_directory(
2748 const ExecContext *context,
2749 const ExecParameters *params,
2750 const bool needs_mount_ns,
2751 int *exit_status) {
2752
2753 assert(context);
2754 assert(exit_status);
2755
2756 if (params->flags & EXEC_APPLY_CHROOT) {
2757 if (!needs_mount_ns && context->root_directory)
2758 if (chroot(context->root_directory) < 0) {
2759 *exit_status = EXIT_CHROOT;
2760 return -errno;
2761 }
2762 }
2763
2764 return 0;
2765 }
2766
2767 static int setup_keyring(
2768 const Unit *u,
2769 const ExecContext *context,
2770 const ExecParameters *p,
2771 uid_t uid, gid_t gid) {
2772
2773 key_serial_t keyring;
2774 int r = 0;
2775 uid_t saved_uid;
2776 gid_t saved_gid;
2777
2778 assert(u);
2779 assert(context);
2780 assert(p);
2781
2782 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2783 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2784 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2785 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2786 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2787 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2788
2789 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2790 return 0;
2791
2792 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2793 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2794 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2795 * & group is just as nasty as acquiring a reference to the user keyring. */
2796
2797 saved_uid = getuid();
2798 saved_gid = getgid();
2799
2800 if (gid_is_valid(gid) && gid != saved_gid) {
2801 if (setregid(gid, -1) < 0)
2802 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2803 }
2804
2805 if (uid_is_valid(uid) && uid != saved_uid) {
2806 if (setreuid(uid, -1) < 0) {
2807 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2808 goto out;
2809 }
2810 }
2811
2812 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2813 if (keyring == -1) {
2814 if (errno == ENOSYS)
2815 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2816 else if (IN_SET(errno, EACCES, EPERM))
2817 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2818 else if (errno == EDQUOT)
2819 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2820 else
2821 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2822
2823 goto out;
2824 }
2825
2826 /* When requested link the user keyring into the session keyring. */
2827 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2828
2829 if (keyctl(KEYCTL_LINK,
2830 KEY_SPEC_USER_KEYRING,
2831 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2832 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2833 goto out;
2834 }
2835 }
2836
2837 /* Restore uid/gid back */
2838 if (uid_is_valid(uid) && uid != saved_uid) {
2839 if (setreuid(saved_uid, -1) < 0) {
2840 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2841 goto out;
2842 }
2843 }
2844
2845 if (gid_is_valid(gid) && gid != saved_gid) {
2846 if (setregid(saved_gid, -1) < 0)
2847 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2848 }
2849
2850 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2851 if (!sd_id128_is_null(u->invocation_id)) {
2852 key_serial_t key;
2853
2854 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2855 if (key == -1)
2856 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2857 else {
2858 if (keyctl(KEYCTL_SETPERM, key,
2859 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2860 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2861 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2862 }
2863 }
2864
2865 out:
2866 /* Revert back uid & gid for the last time, and exit */
2867 /* no extra logging, as only the first already reported error matters */
2868 if (getuid() != saved_uid)
2869 (void) setreuid(saved_uid, -1);
2870
2871 if (getgid() != saved_gid)
2872 (void) setregid(saved_gid, -1);
2873
2874 return r;
2875 }
2876
/* Appends each non-negative file descriptor of 'pair' (first element first) to 'array', advancing the
 * element counter *n for every descriptor added. Negative entries are skipped. The caller must make
 * sure 'array' has room for up to two more elements. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        size_t k;

        assert(array);
        assert(n);
        assert(pair);

        for (k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2887
2888 static int close_remaining_fds(
2889 const ExecParameters *params,
2890 const ExecRuntime *runtime,
2891 const DynamicCreds *dcreds,
2892 int user_lookup_fd,
2893 int socket_fd,
2894 int exec_fd,
2895 const int *fds, size_t n_fds) {
2896
2897 size_t n_dont_close = 0;
2898 int dont_close[n_fds + 12];
2899
2900 assert(params);
2901
2902 if (params->stdin_fd >= 0)
2903 dont_close[n_dont_close++] = params->stdin_fd;
2904 if (params->stdout_fd >= 0)
2905 dont_close[n_dont_close++] = params->stdout_fd;
2906 if (params->stderr_fd >= 0)
2907 dont_close[n_dont_close++] = params->stderr_fd;
2908
2909 if (socket_fd >= 0)
2910 dont_close[n_dont_close++] = socket_fd;
2911 if (exec_fd >= 0)
2912 dont_close[n_dont_close++] = exec_fd;
2913 if (n_fds > 0) {
2914 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2915 n_dont_close += n_fds;
2916 }
2917
2918 if (runtime)
2919 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2920
2921 if (dcreds) {
2922 if (dcreds->user)
2923 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2924 if (dcreds->group)
2925 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2926 }
2927
2928 if (user_lookup_fd >= 0)
2929 dont_close[n_dont_close++] = user_lookup_fd;
2930
2931 return close_all_fds(dont_close, n_dont_close);
2932 }
2933
2934 static int send_user_lookup(
2935 Unit *unit,
2936 int user_lookup_fd,
2937 uid_t uid,
2938 gid_t gid) {
2939
2940 assert(unit);
2941
2942 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2943 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2944 * specified. */
2945
2946 if (user_lookup_fd < 0)
2947 return 0;
2948
2949 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2950 return 0;
2951
2952 if (writev(user_lookup_fd,
2953 (struct iovec[]) {
2954 IOVEC_INIT(&uid, sizeof(uid)),
2955 IOVEC_INIT(&gid, sizeof(gid)),
2956 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2957 return -errno;
2958
2959 return 0;
2960 }
2961
2962 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2963 int r;
2964
2965 assert(c);
2966 assert(home);
2967 assert(buf);
2968
2969 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2970
2971 if (*home)
2972 return 0;
2973
2974 if (!c->working_directory_home)
2975 return 0;
2976
2977 r = get_home_dir(buf);
2978 if (r < 0)
2979 return r;
2980
2981 *home = *buf;
2982 return 1;
2983 }
2984
2985 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2986 _cleanup_strv_free_ char ** list = NULL;
2987 ExecDirectoryType t;
2988 int r;
2989
2990 assert(c);
2991 assert(p);
2992 assert(ret);
2993
2994 assert(c->dynamic_user);
2995
2996 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2997 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2998 * directories. */
2999
3000 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3001 char **i;
3002
3003 if (t == EXEC_DIRECTORY_CONFIGURATION)
3004 continue;
3005
3006 if (!p->prefix[t])
3007 continue;
3008
3009 STRV_FOREACH(i, c->directories[t].paths) {
3010 char *e;
3011
3012 if (exec_directory_is_private(c, t))
3013 e = path_join(p->prefix[t], "private", *i);
3014 else
3015 e = path_join(p->prefix[t], *i);
3016 if (!e)
3017 return -ENOMEM;
3018
3019 r = strv_consume(&list, e);
3020 if (r < 0)
3021 return r;
3022 }
3023 }
3024
3025 *ret = TAKE_PTR(list);
3026
3027 return 0;
3028 }
3029
3030 static char *exec_command_line(char **argv);
3031
3032 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3033 bool using_subcgroup;
3034 char *p;
3035
3036 assert(params);
3037 assert(ret);
3038
3039 if (!params->cgroup_path)
3040 return -EINVAL;
3041
3042 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3043 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3044 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3045 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3046 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3047 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3048 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3049 * flag, which is only passed for the former statements, not for the latter. */
3050
3051 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3052 if (using_subcgroup)
3053 p = path_join(params->cgroup_path, ".control");
3054 else
3055 p = strdup(params->cgroup_path);
3056 if (!p)
3057 return -ENOMEM;
3058
3059 *ret = p;
3060 return using_subcgroup;
3061 }
3062
3063 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3064 _cleanup_(cpu_set_reset) CPUSet s = {};
3065 int r;
3066
3067 assert(c);
3068 assert(ret);
3069
3070 if (!c->numa_policy.nodes.set) {
3071 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3072 return 0;
3073 }
3074
3075 r = numa_to_cpu_set(&c->numa_policy, &s);
3076 if (r < 0)
3077 return r;
3078
3079 cpu_set_reset(ret);
3080
3081 return cpu_set_add_all(ret, &s);
3082 }
3083
3084 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3085 assert(c);
3086
3087 return c->cpu_affinity_from_numa;
3088 }
3089
3090 static int exec_child(
3091 Unit *unit,
3092 const ExecCommand *command,
3093 const ExecContext *context,
3094 const ExecParameters *params,
3095 ExecRuntime *runtime,
3096 DynamicCreds *dcreds,
3097 int socket_fd,
3098 const int named_iofds[static 3],
3099 int *fds,
3100 size_t n_socket_fds,
3101 size_t n_storage_fds,
3102 char **files_env,
3103 int user_lookup_fd,
3104 int *exit_status) {
3105
3106 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
3107 int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
3108 _cleanup_free_ gid_t *supplementary_gids = NULL;
3109 const char *username = NULL, *groupname = NULL;
3110 _cleanup_free_ char *home_buffer = NULL;
3111 const char *home = NULL, *shell = NULL;
3112 char **final_argv = NULL;
3113 dev_t journal_stream_dev = 0;
3114 ino_t journal_stream_ino = 0;
3115 bool userns_set_up = false;
3116 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3117 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3118 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3119 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
3120 #if HAVE_SELINUX
3121 _cleanup_free_ char *mac_selinux_context_net = NULL;
3122 bool use_selinux = false;
3123 #endif
3124 #if ENABLE_SMACK
3125 bool use_smack = false;
3126 #endif
3127 #if HAVE_APPARMOR
3128 bool use_apparmor = false;
3129 #endif
3130 uid_t saved_uid = getuid();
3131 gid_t saved_gid = getgid();
3132 uid_t uid = UID_INVALID;
3133 gid_t gid = GID_INVALID;
3134 size_t n_fds;
3135 ExecDirectoryType dt;
3136 int secure_bits;
3137 _cleanup_free_ gid_t *gids_after_pam = NULL;
3138 int ngids_after_pam = 0;
3139
3140 assert(unit);
3141 assert(command);
3142 assert(context);
3143 assert(params);
3144 assert(exit_status);
3145
3146 rename_process_from_path(command->path);
3147
3148 /* We reset exactly these signals, since they are the
3149 * only ones we set to SIG_IGN in the main daemon. All
3150 * others we leave untouched because we set them to
3151 * SIG_DFL or a valid handler initially, both of which
3152 * will be demoted to SIG_DFL. */
3153 (void) default_signals(SIGNALS_CRASH_HANDLER,
3154 SIGNALS_IGNORE, -1);
3155
3156 if (context->ignore_sigpipe)
3157 (void) ignore_signals(SIGPIPE, -1);
3158
3159 r = reset_signal_mask();
3160 if (r < 0) {
3161 *exit_status = EXIT_SIGNAL_MASK;
3162 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
3163 }
3164
3165 if (params->idle_pipe)
3166 do_idle_pipe_dance(params->idle_pipe);
3167
3168 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3169 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3170 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3171 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
3172
3173 log_forget_fds();
3174 log_set_open_when_needed(true);
3175
3176 /* In case anything used libc syslog(), close this here, too */
3177 closelog();
3178
3179 n_fds = n_socket_fds + n_storage_fds;
3180 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
3181 if (r < 0) {
3182 *exit_status = EXIT_FDS;
3183 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3184 }
3185
3186 if (!context->same_pgrp)
3187 if (setsid() < 0) {
3188 *exit_status = EXIT_SETSID;
3189 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3190 }
3191
3192 exec_context_tty_reset(context, params);
3193
3194 if (unit_shall_confirm_spawn(unit)) {
3195 const char *vc = params->confirm_spawn;
3196 _cleanup_free_ char *cmdline = NULL;
3197
3198 cmdline = exec_command_line(command->argv);
3199 if (!cmdline) {
3200 *exit_status = EXIT_MEMORY;
3201 return log_oom();
3202 }
3203
3204 r = ask_for_confirmation(vc, unit, cmdline);
3205 if (r != CONFIRM_EXECUTE) {
3206 if (r == CONFIRM_PRETEND_SUCCESS) {
3207 *exit_status = EXIT_SUCCESS;
3208 return 0;
3209 }
3210 *exit_status = EXIT_CONFIRM;
3211 log_unit_error(unit, "Execution cancelled by the user");
3212 return -ECANCELED;
3213 }
3214 }
3215
3216 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3217 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3218 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3219 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3220 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3221 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3222 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3223 *exit_status = EXIT_MEMORY;
3224 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3225 }
3226
3227 if (context->dynamic_user && dcreds) {
3228 _cleanup_strv_free_ char **suggested_paths = NULL;
3229
3230 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3231 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3232 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3233 *exit_status = EXIT_USER;
3234 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3235 }
3236
3237 r = compile_suggested_paths(context, params, &suggested_paths);
3238 if (r < 0) {
3239 *exit_status = EXIT_MEMORY;
3240 return log_oom();
3241 }
3242
3243 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3244 if (r < 0) {
3245 *exit_status = EXIT_USER;
3246 if (r == -EILSEQ) {
3247 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3248 return -EOPNOTSUPP;
3249 }
3250 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3251 }
3252
3253 if (!uid_is_valid(uid)) {
3254 *exit_status = EXIT_USER;
3255 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
3256 return -ESRCH;
3257 }
3258
3259 if (!gid_is_valid(gid)) {
3260 *exit_status = EXIT_USER;
3261 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
3262 return -ESRCH;
3263 }
3264
3265 if (dcreds->user)
3266 username = dcreds->user->name;
3267
3268 } else {
3269 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3270 if (r < 0) {
3271 *exit_status = EXIT_USER;
3272 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3273 }
3274
3275 r = get_fixed_group(context, &groupname, &gid);
3276 if (r < 0) {
3277 *exit_status = EXIT_GROUP;
3278 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3279 }
3280 }
3281
3282 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3283 r = get_supplementary_groups(context, username, groupname, gid,
3284 &supplementary_gids, &ngids);
3285 if (r < 0) {
3286 *exit_status = EXIT_GROUP;
3287 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3288 }
3289
3290 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3291 if (r < 0) {
3292 *exit_status = EXIT_USER;
3293 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3294 }
3295
3296 user_lookup_fd = safe_close(user_lookup_fd);
3297
3298 r = acquire_home(context, uid, &home, &home_buffer);
3299 if (r < 0) {
3300 *exit_status = EXIT_CHDIR;
3301 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3302 }
3303
3304 /* If a socket is connected to STDIN/STDOUT/STDERR, we
3305 * must sure to drop O_NONBLOCK */
3306 if (socket_fd >= 0)
3307 (void) fd_nonblock(socket_fd, false);
3308
3309 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3310 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3311 if (params->cgroup_path) {
3312 _cleanup_free_ char *p = NULL;
3313
3314 r = exec_parameters_get_cgroup_path(params, &p);
3315 if (r < 0) {
3316 *exit_status = EXIT_CGROUP;
3317 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3318 }
3319
3320 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3321 if (r < 0) {
3322 *exit_status = EXIT_CGROUP;
3323 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3324 }
3325 }
3326
3327 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3328 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3329 if (r < 0) {
3330 *exit_status = EXIT_NETWORK;
3331 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3332 }
3333 }
3334
3335 r = setup_input(context, params, socket_fd, named_iofds);
3336 if (r < 0) {
3337 *exit_status = EXIT_STDIN;
3338 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3339 }
3340
3341 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3342 if (r < 0) {
3343 *exit_status = EXIT_STDOUT;
3344 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3345 }
3346
3347 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3348 if (r < 0) {
3349 *exit_status = EXIT_STDERR;
3350 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3351 }
3352
3353 if (context->oom_score_adjust_set) {
3354 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3355 * prohibit write access to this file, and we shouldn't trip up over that. */
3356 r = set_oom_score_adjust(context->oom_score_adjust);
3357 if (IN_SET(r, -EPERM, -EACCES))
3358 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3359 else if (r < 0) {
3360 *exit_status = EXIT_OOM_ADJUST;
3361 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3362 }
3363 }
3364
3365 if (context->coredump_filter_set) {
3366 r = set_coredump_filter(context->coredump_filter);
3367 if (ERRNO_IS_PRIVILEGE(r))
3368 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
3369 else if (r < 0)
3370 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
3371 }
3372
3373 if (context->nice_set) {
3374 r = setpriority_closest(context->nice);
3375 if (r < 0)
3376 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
3377 }
3378
3379 if (context->cpu_sched_set) {
3380 struct sched_param param = {
3381 .sched_priority = context->cpu_sched_priority,
3382 };
3383
3384 r = sched_setscheduler(0,
3385 context->cpu_sched_policy |
3386 (context->cpu_sched_reset_on_fork ?
3387 SCHED_RESET_ON_FORK : 0),
3388 &param);
3389 if (r < 0) {
3390 *exit_status = EXIT_SETSCHEDULER;
3391 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3392 }
3393 }
3394
3395 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
3396 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
3397 const CPUSet *cpu_set;
3398
3399 if (context->cpu_affinity_from_numa) {
3400 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
3401 if (r < 0) {
3402 *exit_status = EXIT_CPUAFFINITY;
3403 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
3404 }
3405
3406 cpu_set = &converted_cpu_set;
3407 } else
3408 cpu_set = &context->cpu_set;
3409
3410 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
3411 *exit_status = EXIT_CPUAFFINITY;
3412 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3413 }
3414 }
3415
3416 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3417 r = apply_numa_policy(&context->numa_policy);
3418 if (r == -EOPNOTSUPP)
3419 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
3420 else if (r < 0) {
3421 *exit_status = EXIT_NUMA_POLICY;
3422 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
3423 }
3424 }
3425
3426 if (context->ioprio_set)
3427 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3428 *exit_status = EXIT_IOPRIO;
3429 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3430 }
3431
3432 if (context->timer_slack_nsec != NSEC_INFINITY)
3433 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3434 *exit_status = EXIT_TIMERSLACK;
3435 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3436 }
3437
3438 if (context->personality != PERSONALITY_INVALID) {
3439 r = safe_personality(context->personality);
3440 if (r < 0) {
3441 *exit_status = EXIT_PERSONALITY;
3442 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3443 }
3444 }
3445
3446 if (context->utmp_id)
3447 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3448 context->tty_path,
3449 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
3450 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3451 USER_PROCESS,
3452 username);
3453
3454 if (uid_is_valid(uid)) {
3455 r = chown_terminal(STDIN_FILENO, uid);
3456 if (r < 0) {
3457 *exit_status = EXIT_STDIN;
3458 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3459 }
3460 }
3461
3462 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3463 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3464 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3465 * touch a single hierarchy too. */
3466 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3467 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3468 if (r < 0) {
3469 *exit_status = EXIT_CGROUP;
3470 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3471 }
3472 }
3473
3474 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3475 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3476 if (r < 0)
3477 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3478 }
3479
3480 r = build_environment(
3481 unit,
3482 context,
3483 params,
3484 n_fds,
3485 home,
3486 username,
3487 shell,
3488 journal_stream_dev,
3489 journal_stream_ino,
3490 &our_env);
3491 if (r < 0) {
3492 *exit_status = EXIT_MEMORY;
3493 return log_oom();
3494 }
3495
3496 r = build_pass_environment(context, &pass_env);
3497 if (r < 0) {
3498 *exit_status = EXIT_MEMORY;
3499 return log_oom();
3500 }
3501
3502 accum_env = strv_env_merge(5,
3503 params->environment,
3504 our_env,
3505 pass_env,
3506 context->environment,
3507 files_env);
3508 if (!accum_env) {
3509 *exit_status = EXIT_MEMORY;
3510 return log_oom();
3511 }
3512 accum_env = strv_env_clean(accum_env);
3513
3514 (void) umask(context->umask);
3515
3516 r = setup_keyring(unit, context, params, uid, gid);
3517 if (r < 0) {
3518 *exit_status = EXIT_KEYRING;
3519 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3520 }
3521
3522 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3523 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3524
3525 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3526 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3527
3528 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3529 if (needs_ambient_hack)
3530 needs_setuid = false;
3531 else
3532 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3533
3534 if (needs_sandboxing) {
3535 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3536 * present. The actual MAC context application will happen later, as late as possible, to avoid
3537 * impacting our own code paths. */
3538
3539 #if HAVE_SELINUX
3540 use_selinux = mac_selinux_use();
3541 #endif
3542 #if ENABLE_SMACK
3543 use_smack = mac_smack_use();
3544 #endif
3545 #if HAVE_APPARMOR
3546 use_apparmor = mac_apparmor_use();
3547 #endif
3548 }
3549
3550 if (needs_sandboxing) {
3551 int which_failed;
3552
3553 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3554 * is set here. (See below.) */
3555
3556 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3557 if (r < 0) {
3558 *exit_status = EXIT_LIMITS;
3559 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3560 }
3561 }
3562
3563 if (needs_setuid) {
3564
3565 /* Let's call into PAM after we set up our own idea of resource limits to that pam_limits
3566 * wins here. (See above.) */
3567
3568 if (context->pam_name && username) {
3569 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3570 if (r < 0) {
3571 *exit_status = EXIT_PAM;
3572 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3573 }
3574
3575 ngids_after_pam = getgroups_alloc(&gids_after_pam);
3576 if (ngids_after_pam < 0) {
3577 *exit_status = EXIT_MEMORY;
3578 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
3579 }
3580 }
3581 }
3582
3583 if (needs_sandboxing) {
3584 #if HAVE_SELINUX
3585 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3586 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3587 if (r < 0) {
3588 *exit_status = EXIT_SELINUX_CONTEXT;
3589 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3590 }
3591 }
3592 #endif
3593
3594 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
3595 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
3596 * set up the all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
3597 if (context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
3598 userns_set_up = true;
3599 r = setup_private_users(saved_uid, saved_gid, uid, gid);
3600 if (r < 0) {
3601 *exit_status = EXIT_USER;
3602 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
3603 }
3604 }
3605 }
3606
3607 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3608
3609 if (ns_type_supported(NAMESPACE_NET)) {
3610 r = setup_netns(runtime->netns_storage_socket);
3611 if (r == -EPERM)
3612 log_unit_warning_errno(unit, r,
3613 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
3614 else if (r < 0) {
3615 *exit_status = EXIT_NETWORK;
3616 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3617 }
3618 } else if (context->network_namespace_path) {
3619 *exit_status = EXIT_NETWORK;
3620 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
3621 "NetworkNamespacePath= is not supported, refusing.");
3622 } else
3623 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3624 }
3625
3626 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3627 if (needs_mount_namespace) {
3628 _cleanup_free_ char *error_path = NULL;
3629
3630 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3631 if (r < 0) {
3632 *exit_status = EXIT_NAMESPACE;
3633 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3634 error_path ? ": " : "", strempty(error_path));
3635 }
3636 }
3637
3638 if (needs_sandboxing) {
3639 r = apply_protect_hostname(unit, context, exit_status);
3640 if (r < 0)
3641 return r;
3642 }
3643
3644 /* Drop groups as early as possible.
3645 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
3646 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
3647 if (needs_setuid) {
3648 _cleanup_free_ gid_t *gids_to_enforce = NULL;
3649 int ngids_to_enforce = 0;
3650
3651 ngids_to_enforce = merge_gid_lists(supplementary_gids,
3652 ngids,
3653 gids_after_pam,
3654 ngids_after_pam,
3655 &gids_to_enforce);
3656 if (ngids_to_enforce < 0) {
3657 *exit_status = EXIT_MEMORY;
3658 return log_unit_error_errno(unit,
3659 ngids_to_enforce,
3660 "Failed to merge group lists. Group membership might be incorrect: %m");
3661 }
3662
3663 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
3664 if (r < 0) {
3665 *exit_status = EXIT_GROUP;
3666 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3667 }
3668 }
3669
3670 /* If the user namespace was not set up above, try to do it now.
3671 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
3672 * restricted by rules pertaining to combining user namspaces with other namespaces (e.g. in the
3673 * case of mount namespaces being less privileged when the mount point list is copied from a
3674 * different user namespace). */
3675
3676 if (needs_sandboxing && context->private_users && !userns_set_up) {
3677 r = setup_private_users(saved_uid, saved_gid, uid, gid);
3678 if (r < 0) {
3679 *exit_status = EXIT_USER;
3680 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3681 }
3682 }
3683
3684 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3685 * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3686 * however if we have it as we want to keep it open until the final execve(). */
3687
3688 if (params->exec_fd >= 0) {
3689 exec_fd = params->exec_fd;
3690
3691 if (exec_fd < 3 + (int) n_fds) {
3692 int moved_fd;
3693
3694 /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3695 * process we are about to execute. */
3696
3697 moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3698 if (moved_fd < 0) {
3699 *exit_status = EXIT_FDS;
3700 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3701 }
3702
3703 safe_close(exec_fd);
3704 exec_fd = moved_fd;
3705 } else {
3706 /* This fd should be FD_CLOEXEC already, but let's make sure. */
3707 r = fd_cloexec(exec_fd, true);
3708 if (r < 0) {
3709 *exit_status = EXIT_FDS;
3710 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3711 }
3712 }
3713
3714 fds_with_exec_fd = newa(int, n_fds + 1);
3715 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3716 fds_with_exec_fd[n_fds] = exec_fd;
3717 n_fds_with_exec_fd = n_fds + 1;
3718 } else {
3719 fds_with_exec_fd = fds;
3720 n_fds_with_exec_fd = n_fds;
3721 }
3722
3723 r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3724 if (r >= 0)
3725 r = shift_fds(fds, n_fds);
3726 if (r >= 0)
3727 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3728 if (r < 0) {
3729 *exit_status = EXIT_FDS;
3730 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3731 }
3732
3733 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3734 * and at the right fd numbers. The are no other fds open, with one exception: the exec_fd if it is defined,
3735 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3736 * came this far. */
3737
3738 secure_bits = context->secure_bits;
3739
3740 if (needs_sandboxing) {
3741 uint64_t bset;
3742
3743 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3744 * requested. (Note this is placed after the general resource limit initialization, see
3745 * above, in order to take precedence.) */
3746 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3747 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3748 *exit_status = EXIT_LIMITS;
3749 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3750 }
3751 }
3752
3753 #if ENABLE_SMACK
3754 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3755 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3756 if (use_smack) {
3757 r = setup_smack(context, command);
3758 if (r < 0) {
3759 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3760 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3761 }
3762 }
3763 #endif
3764
3765 bset = context->capability_bounding_set;
3766 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3767 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3768 * instead of us doing that */
3769 if (needs_ambient_hack)
3770 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3771 (UINT64_C(1) << CAP_SETUID) |
3772 (UINT64_C(1) << CAP_SETGID);
3773
3774 if (!cap_test_all(bset)) {
3775 r = capability_bounding_set_drop(bset, false);
3776 if (r < 0) {
3777 *exit_status = EXIT_CAPABILITIES;
3778 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3779 }
3780 }
3781
3782 /* This is done before enforce_user, but ambient set
3783 * does not survive over setresuid() if keep_caps is not set. */
3784 if (!needs_ambient_hack) {
3785 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3786 if (r < 0) {
3787 *exit_status = EXIT_CAPABILITIES;
3788 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3789 }
3790 }
3791 }
3792
3793 /* chroot to root directory first, before we lose the ability to chroot */
3794 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
3795 if (r < 0)
3796 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
3797
3798 if (needs_setuid) {
3799 if (uid_is_valid(uid)) {
3800 r = enforce_user(context, uid);
3801 if (r < 0) {
3802 *exit_status = EXIT_USER;
3803 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3804 }
3805
3806 if (!needs_ambient_hack &&
3807 context->capability_ambient_set != 0) {
3808
3809 /* Fix the ambient capabilities after user change. */
3810 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3811 if (r < 0) {
3812 *exit_status = EXIT_CAPABILITIES;
3813 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3814 }
3815
3816 /* If we were asked to change user and ambient capabilities
3817 * were requested, we had to add keep-caps to the securebits
3818 * so that we would maintain the inherited capability set
3819 * through the setresuid(). Make sure that the bit is added
3820 * also to the context secure_bits so that we don't try to
3821 * drop the bit away next. */
3822
3823 secure_bits |= 1<<SECURE_KEEP_CAPS;
3824 }
3825 }
3826 }
3827
3828 /* Apply working directory here, because the working directory might be on NFS and only the user running
3829 * this service might have the correct privilege to change to the working directory */
3830 r = apply_working_directory(context, params, home, exit_status);
3831 if (r < 0)
3832 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3833
3834 if (needs_sandboxing) {
3835 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3836 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3837 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3838 * are restricted. */
3839
3840 #if HAVE_SELINUX
3841 if (use_selinux) {
3842 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3843
3844 if (exec_context) {
3845 r = setexeccon(exec_context);
3846 if (r < 0) {
3847 *exit_status = EXIT_SELINUX_CONTEXT;
3848 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3849 }
3850 }
3851 }
3852 #endif
3853
3854 #if HAVE_APPARMOR
3855 if (use_apparmor && context->apparmor_profile) {
3856 r = aa_change_onexec(context->apparmor_profile);
3857 if (r < 0 && !context->apparmor_profile_ignore) {
3858 *exit_status = EXIT_APPARMOR_PROFILE;
3859 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3860 }
3861 }
3862 #endif
3863
3864 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3865 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3866 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3867 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3868 *exit_status = EXIT_SECUREBITS;
3869 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3870 }
3871
3872 if (context_has_no_new_privileges(context))
3873 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3874 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3875 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3876 }
3877
3878 #if HAVE_SECCOMP
3879 r = apply_address_families(unit, context);
3880 if (r < 0) {
3881 *exit_status = EXIT_ADDRESS_FAMILIES;
3882 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3883 }
3884
3885 r = apply_memory_deny_write_execute(unit, context);
3886 if (r < 0) {
3887 *exit_status = EXIT_SECCOMP;
3888 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3889 }
3890
3891 r = apply_restrict_realtime(unit, context);
3892 if (r < 0) {
3893 *exit_status = EXIT_SECCOMP;
3894 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3895 }
3896
3897 r = apply_restrict_suid_sgid(unit, context);
3898 if (r < 0) {
3899 *exit_status = EXIT_SECCOMP;
3900 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3901 }
3902
3903 r = apply_restrict_namespaces(unit, context);
3904 if (r < 0) {
3905 *exit_status = EXIT_SECCOMP;
3906 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3907 }
3908
3909 r = apply_protect_sysctl(unit, context);
3910 if (r < 0) {
3911 *exit_status = EXIT_SECCOMP;
3912 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3913 }
3914
3915 r = apply_protect_kernel_modules(unit, context);
3916 if (r < 0) {
3917 *exit_status = EXIT_SECCOMP;
3918 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3919 }
3920
3921 r = apply_protect_kernel_logs(unit, context);
3922 if (r < 0) {
3923 *exit_status = EXIT_SECCOMP;
3924 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
3925 }
3926
3927 r = apply_protect_clock(unit, context);
3928 if (r < 0) {
3929 *exit_status = EXIT_SECCOMP;
3930 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
3931 }
3932
3933 r = apply_private_devices(unit, context);
3934 if (r < 0) {
3935 *exit_status = EXIT_SECCOMP;
3936 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3937 }
3938
3939 r = apply_syscall_archs(unit, context);
3940 if (r < 0) {
3941 *exit_status = EXIT_SECCOMP;
3942 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3943 }
3944
3945 r = apply_lock_personality(unit, context);
3946 if (r < 0) {
3947 *exit_status = EXIT_SECCOMP;
3948 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3949 }
3950
3951 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3952 * by the filter as little as possible. */
3953 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3954 if (r < 0) {
3955 *exit_status = EXIT_SECCOMP;
3956 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3957 }
3958 #endif
3959 }
3960
3961 if (!strv_isempty(context->unset_environment)) {
3962 char **ee = NULL;
3963
3964 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3965 if (!ee) {
3966 *exit_status = EXIT_MEMORY;
3967 return log_oom();
3968 }
3969
3970 strv_free_and_replace(accum_env, ee);
3971 }
3972
3973 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3974 replaced_argv = replace_env_argv(command->argv, accum_env);
3975 if (!replaced_argv) {
3976 *exit_status = EXIT_MEMORY;
3977 return log_oom();
3978 }
3979 final_argv = replaced_argv;
3980 } else
3981 final_argv = command->argv;
3982
3983 if (DEBUG_LOGGING) {
3984 _cleanup_free_ char *line;
3985
3986 line = exec_command_line(final_argv);
3987 if (line)
3988 log_struct(LOG_DEBUG,
3989 "EXECUTABLE=%s", command->path,
3990 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3991 LOG_UNIT_ID(unit),
3992 LOG_UNIT_INVOCATION_ID(unit));
3993 }
3994
3995 if (exec_fd >= 0) {
3996 uint8_t hot = 1;
3997
3998 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3999 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
4000
4001 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4002 *exit_status = EXIT_EXEC;
4003 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4004 }
4005 }
4006
4007 execve(command->path, final_argv, accum_env);
4008 r = -errno;
4009
4010 if (exec_fd >= 0) {
4011 uint8_t hot = 0;
4012
4013 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4014 * that POLLHUP on it no longer means execve() succeeded. */
4015
4016 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4017 *exit_status = EXIT_EXEC;
4018 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4019 }
4020 }
4021
4022 if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4023 log_struct_errno(LOG_INFO, r,
4024 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4025 LOG_UNIT_ID(unit),
4026 LOG_UNIT_INVOCATION_ID(unit),
4027 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4028 command->path),
4029 "EXECUTABLE=%s", command->path);
4030 return 0;
4031 }
4032
4033 *exit_status = EXIT_EXEC;
4034 return log_unit_error_errno(unit, r, "Failed to execute command: %m");
4035 }
4036
4037 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
4038 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
4039
4040 int exec_spawn(Unit *unit,
4041 ExecCommand *command,
4042 const ExecContext *context,
4043 const ExecParameters *params,
4044 ExecRuntime *runtime,
4045 DynamicCreds *dcreds,
4046 pid_t *ret) {
4047
4048 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
4049 _cleanup_free_ char *subcgroup_path = NULL;
4050 _cleanup_strv_free_ char **files_env = NULL;
4051 size_t n_storage_fds = 0, n_socket_fds = 0;
4052 _cleanup_free_ char *line = NULL;
4053 pid_t pid;
4054
4055 assert(unit);
4056 assert(command);
4057 assert(context);
4058 assert(ret);
4059 assert(params);
4060 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4061
4062 if (context->std_input == EXEC_INPUT_SOCKET ||
4063 context->std_output == EXEC_OUTPUT_SOCKET ||
4064 context->std_error == EXEC_OUTPUT_SOCKET) {
4065
4066 if (params->n_socket_fds > 1) {
4067 log_unit_error(unit, "Got more than one socket.");
4068 return -EINVAL;
4069 }
4070
4071 if (params->n_socket_fds == 0) {
4072 log_unit_error(unit, "Got no socket.");
4073 return -EINVAL;
4074 }
4075
4076 socket_fd = params->fds[0];
4077 } else {
4078 socket_fd = -1;
4079 fds = params->fds;
4080 n_socket_fds = params->n_socket_fds;
4081 n_storage_fds = params->n_storage_fds;
4082 }
4083
4084 r = exec_context_named_iofds(context, params, named_iofds);
4085 if (r < 0)
4086 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4087
4088 r = exec_context_load_environment(unit, context, &files_env);
4089 if (r < 0)
4090 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
4091
4092 line = exec_command_line(command->argv);
4093 if (!line)
4094 return log_oom();
4095
4096 log_struct(LOG_DEBUG,
4097 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
4098 "EXECUTABLE=%s", command->path,
4099 LOG_UNIT_ID(unit),
4100 LOG_UNIT_INVOCATION_ID(unit));
4101
4102 if (params->cgroup_path) {
4103 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4104 if (r < 0)
4105 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4106 if (r > 0) { /* We are using a child cgroup */
4107 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4108 if (r < 0)
4109 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4110 }
4111 }
4112
4113 pid = fork();
4114 if (pid < 0)
4115 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
4116
4117 if (pid == 0) {
4118 int exit_status = EXIT_SUCCESS;
4119
4120 r = exec_child(unit,
4121 command,
4122 context,
4123 params,
4124 runtime,
4125 dcreds,
4126 socket_fd,
4127 named_iofds,
4128 fds,
4129 n_socket_fds,
4130 n_storage_fds,
4131 files_env,
4132 unit->manager->user_lookup_fds[1],
4133 &exit_status);
4134
4135 if (r < 0) {
4136 const char *status =
4137 exit_status_to_string(exit_status,
4138 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
4139
4140 log_struct_errno(LOG_ERR, r,
4141 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4142 LOG_UNIT_ID(unit),
4143 LOG_UNIT_INVOCATION_ID(unit),
4144 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
4145 status, command->path),
4146 "EXECUTABLE=%s", command->path);
4147 }
4148
4149 _exit(exit_status);
4150 }
4151
4152 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
4153
4154 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
4155 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
4156 * process will be killed too). */
4157 if (subcgroup_path)
4158 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
4159
4160 exec_status_start(&command->exec_status, pid);
4161
4162 *ret = pid;
4163 return 0;
4164 }
4165
4166 void exec_context_init(ExecContext *c) {
4167 ExecDirectoryType i;
4168
4169 assert(c);
4170
4171 c->umask = 0022;
4172 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
4173 c->cpu_sched_policy = SCHED_OTHER;
4174 c->syslog_priority = LOG_DAEMON|LOG_INFO;
4175 c->syslog_level_prefix = true;
4176 c->ignore_sigpipe = true;
4177 c->timer_slack_nsec = NSEC_INFINITY;
4178 c->personality = PERSONALITY_INVALID;
4179 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4180 c->directories[i].mode = 0755;
4181 c->timeout_clean_usec = USEC_INFINITY;
4182 c->capability_bounding_set = CAP_ALL;
4183 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4184 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
4185 c->log_level_max = -1;
4186 numa_policy_reset(&c->numa_policy);
4187 }
4188
4189 void exec_context_done(ExecContext *c) {
4190 ExecDirectoryType i;
4191 size_t l;
4192
4193 assert(c);
4194
4195 c->environment = strv_free(c->environment);
4196 c->environment_files = strv_free(c->environment_files);
4197 c->pass_environment = strv_free(c->pass_environment);
4198 c->unset_environment = strv_free(c->unset_environment);
4199
4200 rlimit_free_all(c->rlimit);
4201
4202 for (l = 0; l < 3; l++) {
4203 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
4204 c->stdio_file[l] = mfree(c->stdio_file[l]);
4205 }
4206
4207 c->working_directory = mfree(c->working_directory);
4208 c->root_directory = mfree(c->root_directory);
4209 c->root_image = mfree(c->root_image);
4210 c->root_hash = mfree(c->root_hash);
4211 c->root_hash_size = 0;
4212 c->root_hash_path = mfree(c->root_hash_path);
4213 c->root_hash_sig = mfree(c->root_hash_sig);
4214 c->root_hash_sig_size = 0;
4215 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
4216 c->root_verity = mfree(c->root_verity);
4217 c->tty_path = mfree(c->tty_path);
4218 c->syslog_identifier = mfree(c->syslog_identifier);
4219 c->user = mfree(c->user);
4220 c->group = mfree(c->group);
4221
4222 c->supplementary_groups = strv_free(c->supplementary_groups);
4223
4224 c->pam_name = mfree(c->pam_name);
4225
4226 c->read_only_paths = strv_free(c->read_only_paths);
4227 c->read_write_paths = strv_free(c->read_write_paths);
4228 c->inaccessible_paths = strv_free(c->inaccessible_paths);
4229
4230 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
4231 c->bind_mounts = NULL;
4232 c->n_bind_mounts = 0;
4233 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4234 c->temporary_filesystems = NULL;
4235 c->n_temporary_filesystems = 0;
4236
4237 cpu_set_reset(&c->cpu_set);
4238 numa_policy_reset(&c->numa_policy);
4239
4240 c->utmp_id = mfree(c->utmp_id);
4241 c->selinux_context = mfree(c->selinux_context);
4242 c->apparmor_profile = mfree(c->apparmor_profile);
4243 c->smack_process_label = mfree(c->smack_process_label);
4244
4245 c->syscall_filter = hashmap_free(c->syscall_filter);
4246 c->syscall_archs = set_free(c->syscall_archs);
4247 c->address_families = set_free(c->address_families);
4248
4249 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4250 c->directories[i].paths = strv_free(c->directories[i].paths);
4251
4252 c->log_level_max = -1;
4253
4254 exec_context_free_log_extra_fields(c);
4255
4256 c->log_ratelimit_interval_usec = 0;
4257 c->log_ratelimit_burst = 0;
4258
4259 c->stdin_data = mfree(c->stdin_data);
4260 c->stdin_data_size = 0;
4261
4262 c->network_namespace_path = mfree(c->network_namespace_path);
4263
4264 c->log_namespace = mfree(c->log_namespace);
4265 }
4266
4267 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4268 char **i;
4269
4270 assert(c);
4271
4272 if (!runtime_prefix)
4273 return 0;
4274
4275 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4276 _cleanup_free_ char *p;
4277
4278 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4279 p = path_join(runtime_prefix, "private", *i);
4280 else
4281 p = path_join(runtime_prefix, *i);
4282 if (!p)
4283 return -ENOMEM;
4284
4285 /* We execute this synchronously, since we need to be sure this is gone when we start the
4286 * service next. */
4287 (void) rm_rf(p, REMOVE_ROOT);
4288 }
4289
4290 return 0;
4291 }
4292
4293 static void exec_command_done(ExecCommand *c) {
4294 assert(c);
4295
4296 c->path = mfree(c->path);
4297 c->argv = strv_free(c->argv);
4298 }
4299
4300 void exec_command_done_array(ExecCommand *c, size_t n) {
4301 size_t i;
4302
4303 for (i = 0; i < n; i++)
4304 exec_command_done(c+i);
4305 }
4306
4307 ExecCommand* exec_command_free_list(ExecCommand *c) {
4308 ExecCommand *i;
4309
4310 while ((i = c)) {
4311 LIST_REMOVE(command, c, i);
4312 exec_command_done(i);
4313 free(i);
4314 }
4315
4316 return NULL;
4317 }
4318
4319 void exec_command_free_array(ExecCommand **c, size_t n) {
4320 size_t i;
4321
4322 for (i = 0; i < n; i++)
4323 c[i] = exec_command_free_list(c[i]);
4324 }
4325
4326 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4327 size_t i;
4328
4329 for (i = 0; i < n; i++)
4330 exec_status_reset(&c[i].exec_status);
4331 }
4332
4333 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4334 size_t i;
4335
4336 for (i = 0; i < n; i++) {
4337 ExecCommand *z;
4338
4339 LIST_FOREACH(command, z, c[i])
4340 exec_status_reset(&z->exec_status);
4341 }
4342 }
4343
4344 typedef struct InvalidEnvInfo {
4345 const Unit *unit;
4346 const char *path;
4347 } InvalidEnvInfo;
4348
4349 static void invalid_env(const char *p, void *userdata) {
4350 InvalidEnvInfo *info = userdata;
4351
4352 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4353 }
4354
4355 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4356 assert(c);
4357
4358 switch (fd_index) {
4359
4360 case STDIN_FILENO:
4361 if (c->std_input != EXEC_INPUT_NAMED_FD)
4362 return NULL;
4363
4364 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4365
4366 case STDOUT_FILENO:
4367 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4368 return NULL;
4369
4370 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4371
4372 case STDERR_FILENO:
4373 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4374 return NULL;
4375
4376 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4377
4378 default:
4379 return NULL;
4380 }
4381 }
4382
4383 static int exec_context_named_iofds(
4384 const ExecContext *c,
4385 const ExecParameters *p,
4386 int named_iofds[static 3]) {
4387
4388 size_t i, targets;
4389 const char* stdio_fdname[3];
4390 size_t n_fds;
4391
4392 assert(c);
4393 assert(p);
4394 assert(named_iofds);
4395
4396 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4397 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4398 (c->std_error == EXEC_OUTPUT_NAMED_FD);
4399
4400 for (i = 0; i < 3; i++)
4401 stdio_fdname[i] = exec_context_fdname(c, i);
4402
4403 n_fds = p->n_storage_fds + p->n_socket_fds;
4404
4405 for (i = 0; i < n_fds && targets > 0; i++)
4406 if (named_iofds[STDIN_FILENO] < 0 &&
4407 c->std_input == EXEC_INPUT_NAMED_FD &&
4408 stdio_fdname[STDIN_FILENO] &&
4409 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4410
4411 named_iofds[STDIN_FILENO] = p->fds[i];
4412 targets--;
4413
4414 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4415 c->std_output == EXEC_OUTPUT_NAMED_FD &&
4416 stdio_fdname[STDOUT_FILENO] &&
4417 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4418
4419 named_iofds[STDOUT_FILENO] = p->fds[i];
4420 targets--;
4421
4422 } else if (named_iofds[STDERR_FILENO] < 0 &&
4423 c->std_error == EXEC_OUTPUT_NAMED_FD &&
4424 stdio_fdname[STDERR_FILENO] &&
4425 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4426
4427 named_iofds[STDERR_FILENO] = p->fds[i];
4428 targets--;
4429 }
4430
4431 return targets == 0 ? 0 : -ENOENT;
4432 }
4433
4434 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
4435 char **i, **r = NULL;
4436
4437 assert(c);
4438 assert(l);
4439
4440 STRV_FOREACH(i, c->environment_files) {
4441 char *fn;
4442 int k;
4443 unsigned n;
4444 bool ignore = false;
4445 char **p;
4446 _cleanup_globfree_ glob_t pglob = {};
4447
4448 fn = *i;
4449
4450 if (fn[0] == '-') {
4451 ignore = true;
4452 fn++;
4453 }
4454
4455 if (!path_is_absolute(fn)) {
4456 if (ignore)
4457 continue;
4458
4459 strv_free(r);
4460 return -EINVAL;
4461 }
4462
4463 /* Filename supports globbing, take all matching files */
4464 k = safe_glob(fn, 0, &pglob);
4465 if (k < 0) {
4466 if (ignore)
4467 continue;
4468
4469 strv_free(r);
4470 return k;
4471 }
4472
4473 /* When we don't match anything, -ENOENT should be returned */
4474 assert(pglob.gl_pathc > 0);
4475
4476 for (n = 0; n < pglob.gl_pathc; n++) {
4477 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4478 if (k < 0) {
4479 if (ignore)
4480 continue;
4481
4482 strv_free(r);
4483 return k;
4484 }
4485 /* Log invalid environment variables with filename */
4486 if (p) {
4487 InvalidEnvInfo info = {
4488 .unit = unit,
4489 .path = pglob.gl_pathv[n]
4490 };
4491
4492 p = strv_env_clean_with_callback(p, invalid_env, &info);
4493 }
4494
4495 if (!r)
4496 r = p;
4497 else {
4498 char **m;
4499
4500 m = strv_env_merge(2, r, p);
4501 strv_free(r);
4502 strv_free(p);
4503 if (!m)
4504 return -ENOMEM;
4505
4506 r = m;
4507 }
4508 }
4509 }
4510
4511 *l = r;
4512
4513 return 0;
4514 }
4515
4516 static bool tty_may_match_dev_console(const char *tty) {
4517 _cleanup_free_ char *resolved = NULL;
4518
4519 if (!tty)
4520 return true;
4521
4522 tty = skip_dev_prefix(tty);
4523
4524 /* trivial identity? */
4525 if (streq(tty, "console"))
4526 return true;
4527
4528 if (resolve_dev_console(&resolved) < 0)
4529 return true; /* if we could not resolve, assume it may */
4530
4531 /* "tty0" means the active VC, so it may be the same sometimes */
4532 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4533 }
4534
4535 static bool exec_context_may_touch_tty(const ExecContext *ec) {
4536 assert(ec);
4537
4538 return ec->tty_reset ||
4539 ec->tty_vhangup ||
4540 ec->tty_vt_disallocate ||
4541 is_terminal_input(ec->std_input) ||
4542 is_terminal_output(ec->std_output) ||
4543 is_terminal_output(ec->std_error);
4544 }
4545
4546 bool exec_context_may_touch_console(const ExecContext *ec) {
4547
4548 return exec_context_may_touch_tty(ec) &&
4549 tty_may_match_dev_console(exec_context_tty_path(ec));
4550 }
4551
4552 static void strv_fprintf(FILE *f, char **l) {
4553 char **g;
4554
4555 assert(f);
4556
4557 STRV_FOREACH(g, l)
4558 fprintf(f, " %s", *g);
4559 }
4560
4561 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4562 char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
4563 ExecDirectoryType dt;
4564 unsigned i;
4565 int r;
4566
4567 assert(c);
4568 assert(f);
4569
4570 prefix = strempty(prefix);
4571
4572 fprintf(f,
4573 "%sUMask: %04o\n"
4574 "%sWorkingDirectory: %s\n"
4575 "%sRootDirectory: %s\n"
4576 "%sNonBlocking: %s\n"
4577 "%sPrivateTmp: %s\n"
4578 "%sPrivateDevices: %s\n"
4579 "%sProtectKernelTunables: %s\n"
4580 "%sProtectKernelModules: %s\n"
4581 "%sProtectKernelLogs: %s\n"
4582 "%sProtectClock: %s\n"
4583 "%sProtectControlGroups: %s\n"
4584 "%sPrivateNetwork: %s\n"
4585 "%sPrivateUsers: %s\n"
4586 "%sProtectHome: %s\n"
4587 "%sProtectSystem: %s\n"
4588 "%sMountAPIVFS: %s\n"
4589 "%sIgnoreSIGPIPE: %s\n"
4590 "%sMemoryDenyWriteExecute: %s\n"
4591 "%sRestrictRealtime: %s\n"
4592 "%sRestrictSUIDSGID: %s\n"
4593 "%sKeyringMode: %s\n"
4594 "%sProtectHostname: %s\n",
4595 prefix, c->umask,
4596 prefix, c->working_directory ? c->working_directory : "/",
4597 prefix, c->root_directory ? c->root_directory : "/",
4598 prefix, yes_no(c->non_blocking),
4599 prefix, yes_no(c->private_tmp),
4600 prefix, yes_no(c->private_devices),
4601 prefix, yes_no(c->protect_kernel_tunables),
4602 prefix, yes_no(c->protect_kernel_modules),
4603 prefix, yes_no(c->protect_kernel_logs),
4604 prefix, yes_no(c->protect_clock),
4605 prefix, yes_no(c->protect_control_groups),
4606 prefix, yes_no(c->private_network),
4607 prefix, yes_no(c->private_users),
4608 prefix, protect_home_to_string(c->protect_home),
4609 prefix, protect_system_to_string(c->protect_system),
4610 prefix, yes_no(c->mount_apivfs),
4611 prefix, yes_no(c->ignore_sigpipe),
4612 prefix, yes_no(c->memory_deny_write_execute),
4613 prefix, yes_no(c->restrict_realtime),
4614 prefix, yes_no(c->restrict_suid_sgid),
4615 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4616 prefix, yes_no(c->protect_hostname));
4617
4618 if (c->root_image)
4619 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4620
4621 if (c->root_hash) {
4622 _cleanup_free_ char *encoded = NULL;
4623 encoded = hexmem(c->root_hash, c->root_hash_size);
4624 if (encoded)
4625 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
4626 }
4627
4628 if (c->root_hash_path)
4629 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
4630
4631 if (c->root_hash_sig) {
4632 _cleanup_free_ char *encoded = NULL;
4633 ssize_t len;
4634 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
4635 if (len)
4636 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
4637 }
4638
4639 if (c->root_hash_sig_path)
4640 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
4641
4642 if (c->root_verity)
4643 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
4644
4645 STRV_FOREACH(e, c->environment)
4646 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4647
4648 STRV_FOREACH(e, c->environment_files)
4649 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4650
4651 STRV_FOREACH(e, c->pass_environment)
4652 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4653
4654 STRV_FOREACH(e, c->unset_environment)
4655 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4656
4657 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4658
4659 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4660 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4661
4662 STRV_FOREACH(d, c->directories[dt].paths)
4663 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4664 }
4665
4666 fprintf(f,
4667 "%sTimeoutCleanSec: %s\n",
4668 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
4669
4670 if (c->nice_set)
4671 fprintf(f,
4672 "%sNice: %i\n",
4673 prefix, c->nice);
4674
4675 if (c->oom_score_adjust_set)
4676 fprintf(f,
4677 "%sOOMScoreAdjust: %i\n",
4678 prefix, c->oom_score_adjust);
4679
4680 if (c->coredump_filter_set)
4681 fprintf(f,
4682 "%sCoredumpFilter: 0x%"PRIx64"\n",
4683 prefix, c->coredump_filter);
4684
4685 for (i = 0; i < RLIM_NLIMITS; i++)
4686 if (c->rlimit[i]) {
4687 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4688 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4689 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4690 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4691 }
4692
4693 if (c->ioprio_set) {
4694 _cleanup_free_ char *class_str = NULL;
4695
4696 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4697 if (r >= 0)
4698 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4699
4700 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4701 }
4702
4703 if (c->cpu_sched_set) {
4704 _cleanup_free_ char *policy_str = NULL;
4705
4706 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4707 if (r >= 0)
4708 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4709
4710 fprintf(f,
4711 "%sCPUSchedulingPriority: %i\n"
4712 "%sCPUSchedulingResetOnFork: %s\n",
4713 prefix, c->cpu_sched_priority,
4714 prefix, yes_no(c->cpu_sched_reset_on_fork));
4715 }
4716
4717 if (c->cpu_set.set) {
4718 _cleanup_free_ char *affinity = NULL;
4719
4720 affinity = cpu_set_to_range_string(&c->cpu_set);
4721 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
4722 }
4723
4724 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
4725 _cleanup_free_ char *nodes = NULL;
4726
4727 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
4728 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
4729 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
4730 }
4731
4732 if (c->timer_slack_nsec != NSEC_INFINITY)
4733 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4734
4735 fprintf(f,
4736 "%sStandardInput: %s\n"
4737 "%sStandardOutput: %s\n"
4738 "%sStandardError: %s\n",
4739 prefix, exec_input_to_string(c->std_input),
4740 prefix, exec_output_to_string(c->std_output),
4741 prefix, exec_output_to_string(c->std_error));
4742
4743 if (c->std_input == EXEC_INPUT_NAMED_FD)
4744 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4745 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4746 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4747 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4748 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4749
4750 if (c->std_input == EXEC_INPUT_FILE)
4751 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4752 if (c->std_output == EXEC_OUTPUT_FILE)
4753 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4754 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4755 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4756 if (c->std_error == EXEC_OUTPUT_FILE)
4757 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4758 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4759 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4760
4761 if (c->tty_path)
4762 fprintf(f,
4763 "%sTTYPath: %s\n"
4764 "%sTTYReset: %s\n"
4765 "%sTTYVHangup: %s\n"
4766 "%sTTYVTDisallocate: %s\n",
4767 prefix, c->tty_path,
4768 prefix, yes_no(c->tty_reset),
4769 prefix, yes_no(c->tty_vhangup),
4770 prefix, yes_no(c->tty_vt_disallocate));
4771
4772 if (IN_SET(c->std_output,
4773 EXEC_OUTPUT_KMSG,
4774 EXEC_OUTPUT_JOURNAL,
4775 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4776 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4777 IN_SET(c->std_error,
4778 EXEC_OUTPUT_KMSG,
4779 EXEC_OUTPUT_JOURNAL,
4780 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4781 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4782
4783 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4784
4785 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4786 if (r >= 0)
4787 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4788
4789 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4790 if (r >= 0)
4791 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4792 }
4793
4794 if (c->log_level_max >= 0) {
4795 _cleanup_free_ char *t = NULL;
4796
4797 (void) log_level_to_string_alloc(c->log_level_max, &t);
4798
4799 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4800 }
4801
4802 if (c->log_ratelimit_interval_usec > 0) {
4803 char buf_timespan[FORMAT_TIMESPAN_MAX];
4804
4805 fprintf(f,
4806 "%sLogRateLimitIntervalSec: %s\n",
4807 prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
4808 }
4809
4810 if (c->log_ratelimit_burst > 0)
4811 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
4812
4813 if (c->n_log_extra_fields > 0) {
4814 size_t j;
4815
4816 for (j = 0; j < c->n_log_extra_fields; j++) {
4817 fprintf(f, "%sLogExtraFields: ", prefix);
4818 fwrite(c->log_extra_fields[j].iov_base,
4819 1, c->log_extra_fields[j].iov_len,
4820 f);
4821 fputc('\n', f);
4822 }
4823 }
4824
4825 if (c->log_namespace)
4826 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
4827
4828 if (c->secure_bits) {
4829 _cleanup_free_ char *str = NULL;
4830
4831 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4832 if (r >= 0)
4833 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4834 }
4835
4836 if (c->capability_bounding_set != CAP_ALL) {
4837 _cleanup_free_ char *str = NULL;
4838
4839 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4840 if (r >= 0)
4841 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4842 }
4843
4844 if (c->capability_ambient_set != 0) {
4845 _cleanup_free_ char *str = NULL;
4846
4847 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4848 if (r >= 0)
4849 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4850 }
4851
4852 if (c->user)
4853 fprintf(f, "%sUser: %s\n", prefix, c->user);
4854 if (c->group)
4855 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4856
4857 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4858
4859 if (!strv_isempty(c->supplementary_groups)) {
4860 fprintf(f, "%sSupplementaryGroups:", prefix);
4861 strv_fprintf(f, c->supplementary_groups);
4862 fputs("\n", f);
4863 }
4864
4865 if (c->pam_name)
4866 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4867
4868 if (!strv_isempty(c->read_write_paths)) {
4869 fprintf(f, "%sReadWritePaths:", prefix);
4870 strv_fprintf(f, c->read_write_paths);
4871 fputs("\n", f);
4872 }
4873
4874 if (!strv_isempty(c->read_only_paths)) {
4875 fprintf(f, "%sReadOnlyPaths:", prefix);
4876 strv_fprintf(f, c->read_only_paths);
4877 fputs("\n", f);
4878 }
4879
4880 if (!strv_isempty(c->inaccessible_paths)) {
4881 fprintf(f, "%sInaccessiblePaths:", prefix);
4882 strv_fprintf(f, c->inaccessible_paths);
4883 fputs("\n", f);
4884 }
4885
4886 if (c->n_bind_mounts > 0)
4887 for (i = 0; i < c->n_bind_mounts; i++)
4888 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4889 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4890 c->bind_mounts[i].ignore_enoent ? "-": "",
4891 c->bind_mounts[i].source,
4892 c->bind_mounts[i].destination,
4893 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4894
4895 if (c->n_temporary_filesystems > 0)
4896 for (i = 0; i < c->n_temporary_filesystems; i++) {
4897 TemporaryFileSystem *t = c->temporary_filesystems + i;
4898
4899 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4900 t->path,
4901 isempty(t->options) ? "" : ":",
4902 strempty(t->options));
4903 }
4904
4905 if (c->utmp_id)
4906 fprintf(f,
4907 "%sUtmpIdentifier: %s\n",
4908 prefix, c->utmp_id);
4909
4910 if (c->selinux_context)
4911 fprintf(f,
4912 "%sSELinuxContext: %s%s\n",
4913 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4914
4915 if (c->apparmor_profile)
4916 fprintf(f,
4917 "%sAppArmorProfile: %s%s\n",
4918 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4919
4920 if (c->smack_process_label)
4921 fprintf(f,
4922 "%sSmackProcessLabel: %s%s\n",
4923 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4924
4925 if (c->personality != PERSONALITY_INVALID)
4926 fprintf(f,
4927 "%sPersonality: %s\n",
4928 prefix, strna(personality_to_string(c->personality)));
4929
4930 fprintf(f,
4931 "%sLockPersonality: %s\n",
4932 prefix, yes_no(c->lock_personality));
4933
4934 if (c->syscall_filter) {
4935 #if HAVE_SECCOMP
4936 Iterator j;
4937 void *id, *val;
4938 bool first = true;
4939 #endif
4940
4941 fprintf(f,
4942 "%sSystemCallFilter: ",
4943 prefix);
4944
4945 if (!c->syscall_allow_list)
4946 fputc('~', f);
4947
4948 #if HAVE_SECCOMP
4949 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4950 _cleanup_free_ char *name = NULL;
4951 const char *errno_name = NULL;
4952 int num = PTR_TO_INT(val);
4953
4954 if (first)
4955 first = false;
4956 else
4957 fputc(' ', f);
4958
4959 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4960 fputs(strna(name), f);
4961
4962 if (num >= 0) {
4963 errno_name = errno_to_name(num);
4964 if (errno_name)
4965 fprintf(f, ":%s", errno_name);
4966 else
4967 fprintf(f, ":%d", num);
4968 }
4969 }
4970 #endif
4971
4972 fputc('\n', f);
4973 }
4974
4975 if (c->syscall_archs) {
4976 #if HAVE_SECCOMP
4977 Iterator j;
4978 void *id;
4979 #endif
4980
4981 fprintf(f,
4982 "%sSystemCallArchitectures:",
4983 prefix);
4984
4985 #if HAVE_SECCOMP
4986 SET_FOREACH(id, c->syscall_archs, j)
4987 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4988 #endif
4989 fputc('\n', f);
4990 }
4991
4992 if (exec_context_restrict_namespaces_set(c)) {
4993 _cleanup_free_ char *s = NULL;
4994
4995 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4996 if (r >= 0)
4997 fprintf(f, "%sRestrictNamespaces: %s\n",
4998 prefix, strna(s));
4999 }
5000
5001 if (c->network_namespace_path)
5002 fprintf(f,
5003 "%sNetworkNamespacePath: %s\n",
5004 prefix, c->network_namespace_path);
5005
5006 if (c->syscall_errno > 0) {
5007 const char *errno_name;
5008
5009 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
5010
5011 errno_name = errno_to_name(c->syscall_errno);
5012 if (errno_name)
5013 fprintf(f, "%s\n", errno_name);
5014 else
5015 fprintf(f, "%d\n", c->syscall_errno);
5016 }
5017 }
5018
5019 bool exec_context_maintains_privileges(const ExecContext *c) {
5020 assert(c);
5021
5022 /* Returns true if the process forked off would run under
5023 * an unchanged UID or as root. */
5024
5025 if (!c->user)
5026 return true;
5027
5028 if (streq(c->user, "root") || streq(c->user, "0"))
5029 return true;
5030
5031 return false;
5032 }
5033
5034 int exec_context_get_effective_ioprio(const ExecContext *c) {
5035 int p;
5036
5037 assert(c);
5038
5039 if (c->ioprio_set)
5040 return c->ioprio;
5041
5042 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5043 if (p < 0)
5044 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
5045
5046 return p;
5047 }
5048
5049 void exec_context_free_log_extra_fields(ExecContext *c) {
5050 size_t l;
5051
5052 assert(c);
5053
5054 for (l = 0; l < c->n_log_extra_fields; l++)
5055 free(c->log_extra_fields[l].iov_base);
5056 c->log_extra_fields = mfree(c->log_extra_fields);
5057 c->n_log_extra_fields = 0;
5058 }
5059
5060 void exec_context_revert_tty(ExecContext *c) {
5061 int r;
5062
5063 assert(c);
5064
5065 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
5066 exec_context_tty_reset(c, NULL);
5067
5068 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
5069 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
5070 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
5071
5072 if (exec_context_may_touch_tty(c)) {
5073 const char *path;
5074
5075 path = exec_context_tty_path(c);
5076 if (path) {
5077 r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
5078 if (r < 0 && r != -ENOENT)
5079 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
5080 }
5081 }
5082 }
5083
5084 int exec_context_get_clean_directories(
5085 ExecContext *c,
5086 char **prefix,
5087 ExecCleanMask mask,
5088 char ***ret) {
5089
5090 _cleanup_strv_free_ char **l = NULL;
5091 ExecDirectoryType t;
5092 int r;
5093
5094 assert(c);
5095 assert(prefix);
5096 assert(ret);
5097
5098 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5099 char **i;
5100
5101 if (!FLAGS_SET(mask, 1U << t))
5102 continue;
5103
5104 if (!prefix[t])
5105 continue;
5106
5107 STRV_FOREACH(i, c->directories[t].paths) {
5108 char *j;
5109
5110 j = path_join(prefix[t], *i);
5111 if (!j)
5112 return -ENOMEM;
5113
5114 r = strv_consume(&l, j);
5115 if (r < 0)
5116 return r;
5117
5118 /* Also remove private directories unconditionally. */
5119 if (t != EXEC_DIRECTORY_CONFIGURATION) {
5120 j = path_join(prefix[t], "private", *i);
5121 if (!j)
5122 return -ENOMEM;
5123
5124 r = strv_consume(&l, j);
5125 if (r < 0)
5126 return r;
5127 }
5128 }
5129 }
5130
5131 *ret = TAKE_PTR(l);
5132 return 0;
5133 }
5134
5135 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
5136 ExecCleanMask mask = 0;
5137
5138 assert(c);
5139 assert(ret);
5140
5141 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5142 if (!strv_isempty(c->directories[t].paths))
5143 mask |= 1U << t;
5144
5145 *ret = mask;
5146 return 0;
5147 }
5148
5149 void exec_status_start(ExecStatus *s, pid_t pid) {
5150 assert(s);
5151
5152 *s = (ExecStatus) {
5153 .pid = pid,
5154 };
5155
5156 dual_timestamp_get(&s->start_timestamp);
5157 }
5158
5159 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
5160 assert(s);
5161
5162 if (s->pid != pid) {
5163 *s = (ExecStatus) {
5164 .pid = pid,
5165 };
5166 }
5167
5168 dual_timestamp_get(&s->exit_timestamp);
5169
5170 s->code = code;
5171 s->status = status;
5172
5173 if (context && context->utmp_id)
5174 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
5175 }
5176
5177 void exec_status_reset(ExecStatus *s) {
5178 assert(s);
5179
5180 *s = (ExecStatus) {};
5181 }
5182
5183 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
5184 char buf[FORMAT_TIMESTAMP_MAX];
5185
5186 assert(s);
5187 assert(f);
5188
5189 if (s->pid <= 0)
5190 return;
5191
5192 prefix = strempty(prefix);
5193
5194 fprintf(f,
5195 "%sPID: "PID_FMT"\n",
5196 prefix, s->pid);
5197
5198 if (dual_timestamp_is_set(&s->start_timestamp))
5199 fprintf(f,
5200 "%sStart Timestamp: %s\n",
5201 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
5202
5203 if (dual_timestamp_is_set(&s->exit_timestamp))
5204 fprintf(f,
5205 "%sExit Timestamp: %s\n"
5206 "%sExit Code: %s\n"
5207 "%sExit Status: %i\n",
5208 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
5209 prefix, sigchld_code_to_string(s->code),
5210 prefix, s->status);
5211 }
5212
5213 static char *exec_command_line(char **argv) {
5214 size_t k;
5215 char *n, *p, **a;
5216 bool first = true;
5217
5218 assert(argv);
5219
5220 k = 1;
5221 STRV_FOREACH(a, argv)
5222 k += strlen(*a)+3;
5223
5224 n = new(char, k);
5225 if (!n)
5226 return NULL;
5227
5228 p = n;
5229 STRV_FOREACH(a, argv) {
5230
5231 if (!first)
5232 *(p++) = ' ';
5233 else
5234 first = false;
5235
5236 if (strpbrk(*a, WHITESPACE)) {
5237 *(p++) = '\'';
5238 p = stpcpy(p, *a);
5239 *(p++) = '\'';
5240 } else
5241 p = stpcpy(p, *a);
5242
5243 }
5244
5245 *p = 0;
5246
5247 /* FIXME: this doesn't really handle arguments that have
5248 * spaces and ticks in them */
5249
5250 return n;
5251 }
5252
5253 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
5254 _cleanup_free_ char *cmd = NULL;
5255 const char *prefix2;
5256
5257 assert(c);
5258 assert(f);
5259
5260 prefix = strempty(prefix);
5261 prefix2 = strjoina(prefix, "\t");
5262
5263 cmd = exec_command_line(c->argv);
5264 fprintf(f,
5265 "%sCommand Line: %s\n",
5266 prefix, cmd ? cmd : strerror_safe(ENOMEM));
5267
5268 exec_status_dump(&c->exec_status, f, prefix2);
5269 }
5270
5271 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
5272 assert(f);
5273
5274 prefix = strempty(prefix);
5275
5276 LIST_FOREACH(command, c, c)
5277 exec_command_dump(c, f, prefix);
5278 }
5279
5280 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
5281 ExecCommand *end;
5282
5283 assert(l);
5284 assert(e);
5285
5286 if (*l) {
5287 /* It's kind of important, that we keep the order here */
5288 LIST_FIND_TAIL(command, *l, end);
5289 LIST_INSERT_AFTER(command, *l, end, e);
5290 } else
5291 *l = e;
5292 }
5293
5294 int exec_command_set(ExecCommand *c, const char *path, ...) {
5295 va_list ap;
5296 char **l, *p;
5297
5298 assert(c);
5299 assert(path);
5300
5301 va_start(ap, path);
5302 l = strv_new_ap(path, ap);
5303 va_end(ap);
5304
5305 if (!l)
5306 return -ENOMEM;
5307
5308 p = strdup(path);
5309 if (!p) {
5310 strv_free(l);
5311 return -ENOMEM;
5312 }
5313
5314 free_and_replace(c->path, p);
5315
5316 return strv_free_and_replace(c->argv, l);
5317 }
5318
5319 int exec_command_append(ExecCommand *c, const char *path, ...) {
5320 _cleanup_strv_free_ char **l = NULL;
5321 va_list ap;
5322 int r;
5323
5324 assert(c);
5325 assert(path);
5326
5327 va_start(ap, path);
5328 l = strv_new_ap(path, ap);
5329 va_end(ap);
5330
5331 if (!l)
5332 return -ENOMEM;
5333
5334 r = strv_extend_strv(&c->argv, l, false);
5335 if (r < 0)
5336 return r;
5337
5338 return 0;
5339 }
5340
5341 static void *remove_tmpdir_thread(void *p) {
5342 _cleanup_free_ char *path = p;
5343
5344 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5345 return NULL;
5346 }
5347
5348 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5349 int r;
5350
5351 if (!rt)
5352 return NULL;
5353
5354 if (rt->manager)
5355 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
5356
5357 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5358
5359 if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
5360 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
5361
5362 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
5363 if (r < 0)
5364 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
5365 else
5366 rt->tmp_dir = NULL;
5367 }
5368
5369 if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
5370 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
5371
5372 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
5373 if (r < 0)
5374 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
5375 else
5376 rt->var_tmp_dir = NULL;
5377 }
5378
5379 rt->id = mfree(rt->id);
5380 rt->tmp_dir = mfree(rt->tmp_dir);
5381 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
5382 safe_close_pair(rt->netns_storage_socket);
5383 return mfree(rt);
5384 }
5385
/* Cleanup handler for _cleanup_(exec_runtime_freep) variables: frees the ExecRuntime that *rt points
 * to. destroy=false is passed, i.e. the tmp directories are left on disk. */
static void exec_runtime_freep(ExecRuntime **rt) {
        (void) exec_runtime_free(*rt, false);
}
5389
5390 static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
5391 _cleanup_free_ char *id_copy = NULL;
5392 ExecRuntime *n;
5393
5394 assert(ret);
5395
5396 id_copy = strdup(id);
5397 if (!id_copy)
5398 return -ENOMEM;
5399
5400 n = new(ExecRuntime, 1);
5401 if (!n)
5402 return -ENOMEM;
5403
5404 *n = (ExecRuntime) {
5405 .id = TAKE_PTR(id_copy),
5406 .netns_storage_socket = { -1, -1 },
5407 };
5408
5409 *ret = n;
5410 return 0;
5411 }
5412
5413 static int exec_runtime_add(
5414 Manager *m,
5415 const char *id,
5416 char **tmp_dir,
5417 char **var_tmp_dir,
5418 int netns_storage_socket[2],
5419 ExecRuntime **ret) {
5420
5421 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
5422 int r;
5423
5424 assert(m);
5425 assert(id);
5426
5427 /* tmp_dir, var_tmp_dir, netns_storage_socket fds are donated on success */
5428
5429 r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
5430 if (r < 0)
5431 return r;
5432
5433 r = exec_runtime_allocate(&rt, id);
5434 if (r < 0)
5435 return r;
5436
5437 r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
5438 if (r < 0)
5439 return r;
5440
5441 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
5442 rt->tmp_dir = TAKE_PTR(*tmp_dir);
5443 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
5444
5445 if (netns_storage_socket) {
5446 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
5447 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
5448 }
5449
5450 rt->manager = m;
5451
5452 if (ret)
5453 *ret = rt;
5454 /* do not remove created ExecRuntime object when the operation succeeds. */
5455 TAKE_PTR(rt);
5456 return 0;
5457 }
5458
5459 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5460 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
5461 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
5462 int r;
5463
5464 assert(m);
5465 assert(c);
5466 assert(id);
5467
5468 /* It is not necessary to create ExecRuntime object. */
5469 if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
5470 return 0;
5471
5472 if (c->private_tmp &&
5473 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
5474 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
5475 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
5476 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
5477 if (r < 0)
5478 return r;
5479 }
5480
5481 if (c->private_network || c->network_namespace_path) {
5482 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5483 return -errno;
5484 }
5485
5486 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ret);
5487 if (r < 0)
5488 return r;
5489
5490 return 1;
5491 }
5492
5493 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5494 ExecRuntime *rt;
5495 int r;
5496
5497 assert(m);
5498 assert(id);
5499 assert(ret);
5500
5501 rt = hashmap_get(m->exec_runtime_by_id, id);
5502 if (rt)
5503 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5504 goto ref;
5505
5506 if (!create)
5507 return 0;
5508
5509 /* If not found, then create a new object. */
5510 r = exec_runtime_make(m, c, id, &rt);
5511 if (r <= 0)
5512 /* When r == 0, it is not necessary to create ExecRuntime object. */
5513 return r;
5514
5515 ref:
5516 /* increment reference counter. */
5517 rt->n_ref++;
5518 *ret = rt;
5519 return 1;
5520 }
5521
5522 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5523 if (!rt)
5524 return NULL;
5525
5526 assert(rt->n_ref > 0);
5527
5528 rt->n_ref--;
5529 if (rt->n_ref > 0)
5530 return NULL;
5531
5532 return exec_runtime_free(rt, destroy);
5533 }
5534
5535 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5536 ExecRuntime *rt;
5537 Iterator i;
5538
5539 assert(m);
5540 assert(f);
5541 assert(fds);
5542
5543 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5544 fprintf(f, "exec-runtime=%s", rt->id);
5545
5546 if (rt->tmp_dir)
5547 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
5548
5549 if (rt->var_tmp_dir)
5550 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
5551
5552 if (rt->netns_storage_socket[0] >= 0) {
5553 int copy;
5554
5555 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5556 if (copy < 0)
5557 return copy;
5558
5559 fprintf(f, " netns-socket-0=%i", copy);
5560 }
5561
5562 if (rt->netns_storage_socket[1] >= 0) {
5563 int copy;
5564
5565 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5566 if (copy < 0)
5567 return copy;
5568
5569 fprintf(f, " netns-socket-1=%i", copy);
5570 }
5571
5572 fputc('\n', f);
5573 }
5574
5575 return 0;
5576 }
5577
/* Deserializes a single per-unit key/value pair of the old serialization format into the
 * ExecRuntime object registered under the id of unit u, creating and registering that object if it
 * does not exist yet.
 *
 * Recognized keys: "tmp-dir", "var-tmp-dir", "netns-socket-0", "netns-socket-1".
 *
 * Returns 1 if the value was stored, 0 if the key is unknown or the value was dropped (unit without
 * id, unparsable or unknown fd, failure to set up or fill the hashmap), and the result of log_oom()
 * if a memory allocation failed. */
int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
        /* Only set if we allocate a new object below; freed automatically on every early return
         * until it has been handed over to the manager's hashmap. */
        _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
        ExecRuntime *rt;
        int r;

        /* This is for the migration from old (v237 or earlier) serialization text.
         * Due to bug #7790, this may not work for units that use JoinsNamespaceOf=.
         * Even if the ExecRuntime object was originally created by another unit, we cannot tell
         * that from the serialized text, hence we always create a new object owned by this unit. */

        assert(u);
        assert(key);
        assert(value);

        /* The manager tracks ExecRuntime objects by unit id.
         * Hence, drop the serialized text if the unit does not have an id (yet?)...
         * NOTE(review): the log message below talks about the "Invocation ID", but the check is on
         * u->id. */
        if (isempty(u->id)) {
                log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
                return 0;
        }

        r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
                return 0;
        }

        /* Reuse the object if an earlier key of the same unit already created it. */
        rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
        if (!rt) {
                r = exec_runtime_allocate(&rt_create, u->id);
                if (r < 0)
                        return log_oom();

                rt = rt_create;
        }

        if (streq(key, "tmp-dir")) {
                char *copy;

                copy = strdup(value);
                if (!copy)
                        return log_oom();

                free_and_replace(rt->tmp_dir, copy);

        } else if (streq(key, "var-tmp-dir")) {
                char *copy;

                copy = strdup(value);
                if (!copy)
                        return log_oom();

                free_and_replace(rt->var_tmp_dir, copy);

        } else if (streq(key, "netns-socket-0")) {
                int fd;

                /* Only accept fd numbers that are actually part of the passed fd set. */
                if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
                        log_unit_debug(u, "Failed to parse netns socket value: %s", value);
                        return 0;
                }

                /* Replace a previously stored socket, and take the fd out of the set. */
                safe_close(rt->netns_storage_socket[0]);
                rt->netns_storage_socket[0] = fdset_remove(fds, fd);

        } else if (streq(key, "netns-socket-1")) {
                int fd;

                if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
                        log_unit_debug(u, "Failed to parse netns socket value: %s", value);
                        return 0;
                }

                safe_close(rt->netns_storage_socket[1]);
                rt->netns_storage_socket[1] = fdset_remove(fds, fd);
        } else
                /* Not one of our keys. */
                return 0;

        /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
        if (rt_create) {
                r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
                if (r < 0) {
                        log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
                        return 0;
                }

                rt_create->manager = u->manager;

                /* Avoid cleanup */
                TAKE_PTR(rt_create);
        }

        return 1;
}
5672
5673 int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5674 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
5675 char *id = NULL;
5676 int r, fdpair[] = {-1, -1};
5677 const char *p, *v = value;
5678 size_t n;
5679
5680 assert(m);
5681 assert(value);
5682 assert(fds);
5683
5684 n = strcspn(v, " ");
5685 id = strndupa(v, n);
5686 if (v[n] != ' ')
5687 goto finalize;
5688 p = v + n + 1;
5689
5690 v = startswith(p, "tmp-dir=");
5691 if (v) {
5692 n = strcspn(v, " ");
5693 tmp_dir = strndup(v, n);
5694 if (!tmp_dir)
5695 return log_oom();
5696 if (v[n] != ' ')
5697 goto finalize;
5698 p = v + n + 1;
5699 }
5700
5701 v = startswith(p, "var-tmp-dir=");
5702 if (v) {
5703 n = strcspn(v, " ");
5704 var_tmp_dir = strndup(v, n);
5705 if (!var_tmp_dir)
5706 return log_oom();
5707 if (v[n] != ' ')
5708 goto finalize;
5709 p = v + n + 1;
5710 }
5711
5712 v = startswith(p, "netns-socket-0=");
5713 if (v) {
5714 char *buf;
5715
5716 n = strcspn(v, " ");
5717 buf = strndupa(v, n);
5718 if (safe_atoi(buf, &fdpair[0]) < 0 || !fdset_contains(fds, fdpair[0]))
5719 return log_debug("Unable to process exec-runtime netns fd specification.");
5720 fdpair[0] = fdset_remove(fds, fdpair[0]);
5721 if (v[n] != ' ')
5722 goto finalize;
5723 p = v + n + 1;
5724 }
5725
5726 v = startswith(p, "netns-socket-1=");
5727 if (v) {
5728 char *buf;
5729
5730 n = strcspn(v, " ");
5731 buf = strndupa(v, n);
5732 if (safe_atoi(buf, &fdpair[1]) < 0 || !fdset_contains(fds, fdpair[1]))
5733 return log_debug("Unable to process exec-runtime netns fd specification.");
5734 fdpair[1] = fdset_remove(fds, fdpair[1]);
5735 }
5736
5737 finalize:
5738 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, fdpair, NULL);
5739 if (r < 0)
5740 return log_debug_errno(r, "Failed to add exec-runtime: %m");
5741 return 0;
5742 }
5743
5744 void exec_runtime_vacuum(Manager *m) {
5745 ExecRuntime *rt;
5746 Iterator i;
5747
5748 assert(m);
5749
5750 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5751
5752 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5753 if (rt->n_ref > 0)
5754 continue;
5755
5756 (void) exec_runtime_free(rt, false);
5757 }
5758 }
5759
5760 void exec_params_clear(ExecParameters *p) {
5761 if (!p)
5762 return;
5763
5764 strv_free(p->environment);
5765 }
5766
/* Maps each ExecInput value to its string name.
 * NOTE(review): the lookup helpers are generated by the macro below; see string-table.h for their
 * exact names and semantics. */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL]      = "null",
        [EXEC_INPUT_TTY]       = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL]  = "tty-fail",
        [EXEC_INPUT_SOCKET]    = "socket",
        [EXEC_INPUT_NAMED_FD]  = "fd",
        [EXEC_INPUT_DATA]      = "data",
        [EXEC_INPUT_FILE]      = "file",
};

DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5779
/* Maps each ExecOutput value to its string name.
 * NOTE(review): the lookup helpers are generated by the macro below; see string-table.h for their
 * exact names and semantics. */
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
        [EXEC_OUTPUT_INHERIT]             = "inherit",
        [EXEC_OUTPUT_NULL]                = "null",
        [EXEC_OUTPUT_TTY]                 = "tty",
        [EXEC_OUTPUT_KMSG]                = "kmsg",
        [EXEC_OUTPUT_KMSG_AND_CONSOLE]    = "kmsg+console",
        [EXEC_OUTPUT_JOURNAL]             = "journal",
        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
        [EXEC_OUTPUT_SOCKET]              = "socket",
        [EXEC_OUTPUT_NAMED_FD]            = "fd",
        [EXEC_OUTPUT_FILE]                = "file",
        [EXEC_OUTPUT_FILE_APPEND]         = "append",
};

DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5795
/* Maps each ExecUtmpMode value to its string name.
 * NOTE(review): the lookup helpers are generated by the macro below; see string-table.h for their
 * exact names and semantics. */
static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
        [EXEC_UTMP_INIT]  = "init",
        [EXEC_UTMP_LOGIN] = "login",
        [EXEC_UTMP_USER]  = "user",
};

DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5803
/* Maps each ExecPreserveMode value to its string name.
 * NOTE(review): the _WITH_BOOLEAN variant of the lookup macro presumably also accepts boolean
 * strings, with a true value mapping to EXEC_PRESERVE_YES; confirm against string-table.h. */
static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
        [EXEC_PRESERVE_NO]      = "no",
        [EXEC_PRESERVE_YES]     = "yes",
        [EXEC_PRESERVE_RESTART] = "restart",
};

DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5811
/* This table maps ExecDirectoryType to the name of the unit file setting it is configured with.
 * It is the first of three tables in this file that are indexed by ExecDirectoryType. */
static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectory",
        [EXEC_DIRECTORY_STATE]         = "StateDirectory",
        [EXEC_DIRECTORY_CACHE]         = "CacheDirectory",
        [EXEC_DIRECTORY_LOGS]          = "LogsDirectory",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5822
/* This table maps ExecDirectoryType too, but to a generic term identifying the type of resource.
 * It is supposed to be generic enough to also be used for unit types that don't use ExecContext and
 * per-unit directories, specifically .timer units with their timestamp touch file. */
static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "runtime",
        [EXEC_DIRECTORY_STATE]         = "state",
        [EXEC_DIRECTORY_CACHE]         = "cache",
        [EXEC_DIRECTORY_LOGS]          = "logs",
        [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
};

DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
5835
/* This table also maps ExecDirectoryType, namely to the name of the environment variable in which
 * the selected directory is passed to the service payload.
 * NOTE(review): going by the macro name, only a file-local to-string helper is generated here;
 * confirm against string-table.h. */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "RUNTIME_DIRECTORY",
        [EXEC_DIRECTORY_STATE]         = "STATE_DIRECTORY",
        [EXEC_DIRECTORY_CACHE]         = "CACHE_DIRECTORY",
        [EXEC_DIRECTORY_LOGS]          = "LOGS_DIRECTORY",
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5847
/* Maps each ExecKeyringMode value to its string name.
 * NOTE(review): the lookup helpers are generated by the macro below; see string-table.h for their
 * exact names and semantics. */
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
        [EXEC_KEYRING_INHERIT] = "inherit",
        [EXEC_KEYRING_PRIVATE] = "private",
        [EXEC_KEYRING_SHARED]  = "shared",
};

DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);