]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
Merge pull request #14712 from wlhlm/root-storage-daemons-docs-fix
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <poll.h>
6 #include <sys/eventfd.h>
7 #include <sys/ioctl.h>
8 #include <sys/mman.h>
9 #include <sys/personality.h>
10 #include <sys/prctl.h>
11 #include <sys/shm.h>
12 #include <sys/types.h>
13 #include <sys/un.h>
14 #include <unistd.h>
15 #include <utmpx.h>
16
17 #if HAVE_PAM
18 #include <security/pam_appl.h>
19 #endif
20
21 #if HAVE_SELINUX
22 #include <selinux/selinux.h>
23 #endif
24
25 #if HAVE_SECCOMP
26 #include <seccomp.h>
27 #endif
28
29 #if HAVE_APPARMOR
30 #include <sys/apparmor.h>
31 #endif
32
33 #include "sd-messages.h"
34
35 #include "af-list.h"
36 #include "alloc-util.h"
37 #if HAVE_APPARMOR
38 #include "apparmor-util.h"
39 #endif
40 #include "async.h"
41 #include "barrier.h"
42 #include "cap-list.h"
43 #include "capability-util.h"
44 #include "chown-recursive.h"
45 #include "cgroup-setup.h"
46 #include "cpu-set-util.h"
47 #include "def.h"
48 #include "env-file.h"
49 #include "env-util.h"
50 #include "errno-list.h"
51 #include "execute.h"
52 #include "exit-status.h"
53 #include "fd-util.h"
54 #include "format-util.h"
55 #include "fs-util.h"
56 #include "glob-util.h"
57 #include "io-util.h"
58 #include "ioprio.h"
59 #include "label.h"
60 #include "log.h"
61 #include "macro.h"
62 #include "manager.h"
63 #include "memory-util.h"
64 #include "missing_fs.h"
65 #include "mkdir.h"
66 #include "namespace.h"
67 #include "parse-util.h"
68 #include "path-util.h"
69 #include "process-util.h"
70 #include "rlimit-util.h"
71 #include "rm-rf.h"
72 #if HAVE_SECCOMP
73 #include "seccomp-util.h"
74 #endif
75 #include "securebits-util.h"
76 #include "selinux-util.h"
77 #include "signal-util.h"
78 #include "smack-util.h"
79 #include "socket-util.h"
80 #include "special.h"
81 #include "stat-util.h"
82 #include "string-table.h"
83 #include "string-util.h"
84 #include "strv.h"
85 #include "syslog-util.h"
86 #include "terminal-util.h"
87 #include "umask-util.h"
88 #include "unit.h"
89 #include "user-util.h"
90 #include "utmp-wtmp.h"
91
/* Timeout constants, expressed in microseconds (multiples of USEC_PER_SEC).
 * NOTE(review): their users are not visible in this part of the file. */
#define IDLE_TIMEOUT_USEC  (5 * USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1 * USEC_PER_SEC)

/* Send buffer size (8 MiB) requested via fd_inc_sndbuf() for the journal stream socket. */
#define SNDBUF_SIZE (8 * 1024 * 1024)
96
/* Renumbers the passed fds so that fds[i] ends up being fd number i+3, i.e. the array occupies the
 * contiguous range starting at fd 3.
 *
 * Each misplaced fd is duplicated with F_DUPFD (lowest free fd >= i+3), the old fd is closed and the
 * array entry is updated in place. If the wanted number was still taken, the pass is repeated from the
 * first such index until every entry sits at its target.
 *
 * Returns 0 on success, negative errno if fcntl() fails. Modifies the fds array! */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int retry_at = -1, idx;

                for (idx = first; idx < (int) n_fds; idx++) {
                        int want = idx + 3, got;

                        /* Nothing to do if the fd already has the wanted number */
                        if (fds[idx] == want)
                                continue;

                        got = fcntl(fds[idx], F_DUPFD, want);
                        if (got < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = got;

                        /* The wanted slot was occupied, we only got something above it. Remember the
                         * first index where that happened, so that the next pass starts there. */
                        if (retry_at < 0 && got != want)
                                retry_at = idx;
                }

                first = retry_at;
        } while (first >= 0);

        return 0;
}
141
/* Prepares the fds for being passed to a child: the first n_socket_fds entries get O_NONBLOCK set or
 * cleared according to 'nonblock', and FD_CLOEXEC is dropped from all n_socket_fds + n_storage_fds
 * entries. Returns 0 on success, or the first negative error returned by fd_nonblock()/fd_cloexec(). */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds, idx;

        if (total == 0)
                return 0;

        assert(fds);

        for (idx = 0; idx < total; idx++) {
                int r;

                /* O_NONBLOCK is only touched for the socket activation fds, which come first */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC always goes away: these fds are supposed to survive into our children */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
174
175 static const char *exec_context_tty_path(const ExecContext *context) {
176 assert(context);
177
178 if (context->stdio_as_fds)
179 return NULL;
180
181 if (context->tty_path)
182 return context->tty_path;
183
184 return "/dev/console";
185 }
186
187 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
188 const char *path;
189
190 assert(context);
191
192 path = exec_context_tty_path(context);
193
194 if (context->tty_vhangup) {
195 if (p && p->stdin_fd >= 0)
196 (void) terminal_vhangup_fd(p->stdin_fd);
197 else if (path)
198 (void) terminal_vhangup(path);
199 }
200
201 if (context->tty_reset) {
202 if (p && p->stdin_fd >= 0)
203 (void) reset_terminal_fd(p->stdin_fd, true);
204 else if (path)
205 (void) reset_terminal(path);
206 }
207
208 if (context->tty_vt_disallocate && path)
209 (void) vt_disallocate(path);
210 }
211
212 static bool is_terminal_input(ExecInput i) {
213 return IN_SET(i,
214 EXEC_INPUT_TTY,
215 EXEC_INPUT_TTY_FORCE,
216 EXEC_INPUT_TTY_FAIL);
217 }
218
219 static bool is_terminal_output(ExecOutput o) {
220 return IN_SET(o,
221 EXEC_OUTPUT_TTY,
222 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
223 EXEC_OUTPUT_KMSG_AND_CONSOLE,
224 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
225 }
226
227 static bool is_syslog_output(ExecOutput o) {
228 return IN_SET(o,
229 EXEC_OUTPUT_SYSLOG,
230 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
231 }
232
233 static bool is_kmsg_output(ExecOutput o) {
234 return IN_SET(o,
235 EXEC_OUTPUT_KMSG,
236 EXEC_OUTPUT_KMSG_AND_CONSOLE);
237 }
238
239 static bool exec_context_needs_term(const ExecContext *c) {
240 assert(c);
241
242 /* Return true if the execution context suggests we should set $TERM to something useful. */
243
244 if (is_terminal_input(c->std_input))
245 return true;
246
247 if (is_terminal_output(c->std_output))
248 return true;
249
250 if (is_terminal_output(c->std_error))
251 return true;
252
253 return !!c->tty_path;
254 }
255
/* Opens /dev/null with the specified flags (plus O_NOCTTY) and moves the result to fd number nfd via
 * move_fd(). Returns negative errno if open() fails, otherwise whatever move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        return move_fd(null_fd, nfd, false);
}
267
268 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
269 static const union sockaddr_union sa = {
270 .un.sun_family = AF_UNIX,
271 .un.sun_path = "/run/systemd/journal/stdout",
272 };
273 uid_t olduid = UID_INVALID;
274 gid_t oldgid = GID_INVALID;
275 int r;
276
277 if (gid_is_valid(gid)) {
278 oldgid = getgid();
279
280 if (setegid(gid) < 0)
281 return -errno;
282 }
283
284 if (uid_is_valid(uid)) {
285 olduid = getuid();
286
287 if (seteuid(uid) < 0) {
288 r = -errno;
289 goto restore_gid;
290 }
291 }
292
293 r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
294
295 /* If we fail to restore the uid or gid, things will likely
296 fail later on. This should only happen if an LSM interferes. */
297
298 if (uid_is_valid(uid))
299 (void) seteuid(olduid);
300
301 restore_gid:
302 if (gid_is_valid(gid))
303 (void) setegid(oldgid);
304
305 return r;
306 }
307
308 static int connect_logger_as(
309 const Unit *unit,
310 const ExecContext *context,
311 const ExecParameters *params,
312 ExecOutput output,
313 const char *ident,
314 int nfd,
315 uid_t uid,
316 gid_t gid) {
317
318 _cleanup_close_ int fd = -1;
319 int r;
320
321 assert(context);
322 assert(params);
323 assert(output < _EXEC_OUTPUT_MAX);
324 assert(ident);
325 assert(nfd >= 0);
326
327 fd = socket(AF_UNIX, SOCK_STREAM, 0);
328 if (fd < 0)
329 return -errno;
330
331 r = connect_journal_socket(fd, uid, gid);
332 if (r < 0)
333 return r;
334
335 if (shutdown(fd, SHUT_RD) < 0)
336 return -errno;
337
338 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
339
340 if (dprintf(fd,
341 "%s\n"
342 "%s\n"
343 "%i\n"
344 "%i\n"
345 "%i\n"
346 "%i\n"
347 "%i\n",
348 context->syslog_identifier ?: ident,
349 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
350 context->syslog_priority,
351 !!context->syslog_level_prefix,
352 is_syslog_output(output),
353 is_kmsg_output(output),
354 is_terminal_output(output)) < 0)
355 return -errno;
356
357 return move_fd(TAKE_FD(fd), nfd, false);
358 }
359
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) via open_terminal() and moves
 * the result to fd number nfd. Errors of open_terminal() are passed through unmodified. */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);
        if (tty_fd < 0)
                return tty_fd;

        return move_fd(tty_fd, nfd, false);
}
372
373 static int acquire_path(const char *path, int flags, mode_t mode) {
374 union sockaddr_union sa = {};
375 _cleanup_close_ int fd = -1;
376 int r, salen;
377
378 assert(path);
379
380 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
381 flags |= O_CREAT;
382
383 fd = open(path, flags|O_NOCTTY, mode);
384 if (fd >= 0)
385 return TAKE_FD(fd);
386
387 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
388 return -errno;
389 if (strlen(path) >= sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
390 return -ENXIO;
391
392 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
393
394 fd = socket(AF_UNIX, SOCK_STREAM, 0);
395 if (fd < 0)
396 return -errno;
397
398 salen = sockaddr_un_set_path(&sa.un, path);
399 if (salen < 0)
400 return salen;
401
402 if (connect(fd, &sa.sa, salen) < 0)
403 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
404 * indication that his wasn't an AF_UNIX socket after all */
405
406 if ((flags & O_ACCMODE) == O_RDONLY)
407 r = shutdown(fd, SHUT_WR);
408 else if ((flags & O_ACCMODE) == O_WRONLY)
409 r = shutdown(fd, SHUT_RD);
410 else
411 return TAKE_FD(fd);
412 if (r < 0)
413 return -errno;
414
415 return TAKE_FD(fd);
416 }
417
418 static int fixup_input(
419 const ExecContext *context,
420 int socket_fd,
421 bool apply_tty_stdin) {
422
423 ExecInput std_input;
424
425 assert(context);
426
427 std_input = context->std_input;
428
429 if (is_terminal_input(std_input) && !apply_tty_stdin)
430 return EXEC_INPUT_NULL;
431
432 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
433 return EXEC_INPUT_NULL;
434
435 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
436 return EXEC_INPUT_NULL;
437
438 return std_input;
439 }
440
441 static int fixup_output(ExecOutput std_output, int socket_fd) {
442
443 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
444 return EXEC_OUTPUT_INHERIT;
445
446 return std_output;
447 }
448
449 static int setup_input(
450 const ExecContext *context,
451 const ExecParameters *params,
452 int socket_fd,
453 const int named_iofds[static 3]) {
454
455 ExecInput i;
456
457 assert(context);
458 assert(params);
459 assert(named_iofds);
460
461 if (params->stdin_fd >= 0) {
462 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
463 return -errno;
464
465 /* Try to make this the controlling tty, if it is a tty, and reset it */
466 if (isatty(STDIN_FILENO)) {
467 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
468 (void) reset_terminal_fd(STDIN_FILENO, true);
469 }
470
471 return STDIN_FILENO;
472 }
473
474 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
475
476 switch (i) {
477
478 case EXEC_INPUT_NULL:
479 return open_null_as(O_RDONLY, STDIN_FILENO);
480
481 case EXEC_INPUT_TTY:
482 case EXEC_INPUT_TTY_FORCE:
483 case EXEC_INPUT_TTY_FAIL: {
484 int fd;
485
486 fd = acquire_terminal(exec_context_tty_path(context),
487 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
488 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
489 ACQUIRE_TERMINAL_WAIT,
490 USEC_INFINITY);
491 if (fd < 0)
492 return fd;
493
494 return move_fd(fd, STDIN_FILENO, false);
495 }
496
497 case EXEC_INPUT_SOCKET:
498 assert(socket_fd >= 0);
499
500 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
501
502 case EXEC_INPUT_NAMED_FD:
503 assert(named_iofds[STDIN_FILENO] >= 0);
504
505 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
506 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
507
508 case EXEC_INPUT_DATA: {
509 int fd;
510
511 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
512 if (fd < 0)
513 return fd;
514
515 return move_fd(fd, STDIN_FILENO, false);
516 }
517
518 case EXEC_INPUT_FILE: {
519 bool rw;
520 int fd;
521
522 assert(context->stdio_file[STDIN_FILENO]);
523
524 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
525 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
526
527 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
528 if (fd < 0)
529 return fd;
530
531 return move_fd(fd, STDIN_FILENO, false);
532 }
533
534 default:
535 assert_not_reached("Unknown input type");
536 }
537 }
538
539 static bool can_inherit_stderr_from_stdout(
540 const ExecContext *context,
541 ExecOutput o,
542 ExecOutput e) {
543
544 assert(context);
545
546 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
547 * stderr fd */
548
549 if (e == EXEC_OUTPUT_INHERIT)
550 return true;
551 if (e != o)
552 return false;
553
554 if (e == EXEC_OUTPUT_NAMED_FD)
555 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
556
557 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
558 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
559
560 return true;
561 }
562
563 static int setup_output(
564 const Unit *unit,
565 const ExecContext *context,
566 const ExecParameters *params,
567 int fileno,
568 int socket_fd,
569 const int named_iofds[static 3],
570 const char *ident,
571 uid_t uid,
572 gid_t gid,
573 dev_t *journal_stream_dev,
574 ino_t *journal_stream_ino) {
575
576 ExecOutput o;
577 ExecInput i;
578 int r;
579
580 assert(unit);
581 assert(context);
582 assert(params);
583 assert(ident);
584 assert(journal_stream_dev);
585 assert(journal_stream_ino);
586
587 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
588
589 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
590 return -errno;
591
592 return STDOUT_FILENO;
593 }
594
595 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
596 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
597 return -errno;
598
599 return STDERR_FILENO;
600 }
601
602 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
603 o = fixup_output(context->std_output, socket_fd);
604
605 if (fileno == STDERR_FILENO) {
606 ExecOutput e;
607 e = fixup_output(context->std_error, socket_fd);
608
609 /* This expects the input and output are already set up */
610
611 /* Don't change the stderr file descriptor if we inherit all
612 * the way and are not on a tty */
613 if (e == EXEC_OUTPUT_INHERIT &&
614 o == EXEC_OUTPUT_INHERIT &&
615 i == EXEC_INPUT_NULL &&
616 !is_terminal_input(context->std_input) &&
617 getppid () != 1)
618 return fileno;
619
620 /* Duplicate from stdout if possible */
621 if (can_inherit_stderr_from_stdout(context, o, e))
622 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
623
624 o = e;
625
626 } else if (o == EXEC_OUTPUT_INHERIT) {
627 /* If input got downgraded, inherit the original value */
628 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
629 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
630
631 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
632 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
633 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
634
635 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
636 if (getppid() != 1)
637 return fileno;
638
639 /* We need to open /dev/null here anew, to get the right access mode. */
640 return open_null_as(O_WRONLY, fileno);
641 }
642
643 switch (o) {
644
645 case EXEC_OUTPUT_NULL:
646 return open_null_as(O_WRONLY, fileno);
647
648 case EXEC_OUTPUT_TTY:
649 if (is_terminal_input(i))
650 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
651
652 /* We don't reset the terminal if this is just about output */
653 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
654
655 case EXEC_OUTPUT_SYSLOG:
656 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
657 case EXEC_OUTPUT_KMSG:
658 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
659 case EXEC_OUTPUT_JOURNAL:
660 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
661 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
662 if (r < 0) {
663 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
664 r = open_null_as(O_WRONLY, fileno);
665 } else {
666 struct stat st;
667
668 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
669 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
670 * services to detect whether they are connected to the journal or not.
671 *
672 * If both stdout and stderr are connected to a stream then let's make sure to store the data
673 * about STDERR as that's usually the best way to do logging. */
674
675 if (fstat(fileno, &st) >= 0 &&
676 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
677 *journal_stream_dev = st.st_dev;
678 *journal_stream_ino = st.st_ino;
679 }
680 }
681 return r;
682
683 case EXEC_OUTPUT_SOCKET:
684 assert(socket_fd >= 0);
685
686 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
687
688 case EXEC_OUTPUT_NAMED_FD:
689 assert(named_iofds[fileno] >= 0);
690
691 (void) fd_nonblock(named_iofds[fileno], false);
692 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
693
694 case EXEC_OUTPUT_FILE:
695 case EXEC_OUTPUT_FILE_APPEND: {
696 bool rw;
697 int fd, flags;
698
699 assert(context->stdio_file[fileno]);
700
701 rw = context->std_input == EXEC_INPUT_FILE &&
702 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
703
704 if (rw)
705 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
706
707 flags = O_WRONLY;
708 if (o == EXEC_OUTPUT_FILE_APPEND)
709 flags |= O_APPEND;
710
711 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
712 if (fd < 0)
713 return fd;
714
715 return move_fd(fd, fileno, 0);
716 }
717
718 default:
719 assert_not_reached("Unknown error type");
720 }
721 }
722
723 static int chown_terminal(int fd, uid_t uid) {
724 int r;
725
726 assert(fd >= 0);
727
728 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
729 if (isatty(fd) < 1) {
730 if (IN_SET(errno, EINVAL, ENOTTY))
731 return 0; /* not a tty */
732
733 return -errno;
734 }
735
736 /* This might fail. What matters are the results. */
737 r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
738 if (r < 0)
739 return r;
740
741 return 1;
742 }
743
744 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
745 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
746 int r;
747
748 assert(_saved_stdin);
749 assert(_saved_stdout);
750
751 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
752 if (saved_stdin < 0)
753 return -errno;
754
755 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
756 if (saved_stdout < 0)
757 return -errno;
758
759 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
760 if (fd < 0)
761 return fd;
762
763 r = chown_terminal(fd, getuid());
764 if (r < 0)
765 return r;
766
767 r = reset_terminal_fd(fd, true);
768 if (r < 0)
769 return r;
770
771 r = rearrange_stdio(fd, fd, STDERR_FILENO);
772 fd = -1;
773 if (r < 0)
774 return r;
775
776 *_saved_stdin = saved_stdin;
777 *_saved_stdout = saved_stdout;
778
779 saved_stdin = saved_stdout = -1;
780
781 return 0;
782 }
783
784 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
785 assert(err < 0);
786
787 if (err == -ETIMEDOUT)
788 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
789 else {
790 errno = -err;
791 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
792 }
793 }
794
795 static void write_confirm_error(int err, const char *vc, const Unit *u) {
796 _cleanup_close_ int fd = -1;
797
798 assert(vc);
799
800 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
801 if (fd < 0)
802 return;
803
804 write_confirm_error_fd(err, fd, u);
805 }
806
/* Undoes setup_confirm_stdio(): releases the terminal, duplicates the saved fds back onto stdin
 * and stdout (where they are valid), then closes them and resets the variables to whatever
 * safe_close() returns.
 *
 * Returns 0 on success, or the negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int r = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                r = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                r = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return r;
}
828
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1,   /* don't execute, pretend the command failed */
        CONFIRM_PRETEND_SUCCESS =  0,   /* don't execute, pretend the command succeeded */
        CONFIRM_EXECUTE         =  1,   /* go ahead and execute the command */
};
834
835 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
836 int saved_stdout = -1, saved_stdin = -1, r;
837 _cleanup_free_ char *e = NULL;
838 char c;
839
840 /* For any internal errors, assume a positive response. */
841 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
842 if (r < 0) {
843 write_confirm_error(r, vc, u);
844 return CONFIRM_EXECUTE;
845 }
846
847 /* confirm_spawn might have been disabled while we were sleeping. */
848 if (manager_is_confirm_spawn_disabled(u->manager)) {
849 r = 1;
850 goto restore_stdio;
851 }
852
853 e = ellipsize(cmdline, 60, 100);
854 if (!e) {
855 log_oom();
856 r = CONFIRM_EXECUTE;
857 goto restore_stdio;
858 }
859
860 for (;;) {
861 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
862 if (r < 0) {
863 write_confirm_error_fd(r, STDOUT_FILENO, u);
864 r = CONFIRM_EXECUTE;
865 goto restore_stdio;
866 }
867
868 switch (c) {
869 case 'c':
870 printf("Resuming normal execution.\n");
871 manager_disable_confirm_spawn();
872 r = 1;
873 break;
874 case 'D':
875 unit_dump(u, stdout, " ");
876 continue; /* ask again */
877 case 'f':
878 printf("Failing execution.\n");
879 r = CONFIRM_PRETEND_FAILURE;
880 break;
881 case 'h':
882 printf(" c - continue, proceed without asking anymore\n"
883 " D - dump, show the state of the unit\n"
884 " f - fail, don't execute the command and pretend it failed\n"
885 " h - help\n"
886 " i - info, show a short summary of the unit\n"
887 " j - jobs, show jobs that are in progress\n"
888 " s - skip, don't execute the command and pretend it succeeded\n"
889 " y - yes, execute the command\n");
890 continue; /* ask again */
891 case 'i':
892 printf(" Description: %s\n"
893 " Unit: %s\n"
894 " Command: %s\n",
895 u->id, u->description, cmdline);
896 continue; /* ask again */
897 case 'j':
898 manager_dump_jobs(u->manager, stdout, " ");
899 continue; /* ask again */
900 case 'n':
901 /* 'n' was removed in favor of 'f'. */
902 printf("Didn't understand 'n', did you mean 'f'?\n");
903 continue; /* ask again */
904 case 's':
905 printf("Skipping execution.\n");
906 r = CONFIRM_PRETEND_SUCCESS;
907 break;
908 case 'y':
909 r = CONFIRM_EXECUTE;
910 break;
911 default:
912 assert_not_reached("Unhandled choice");
913 }
914 break;
915 }
916
917 restore_stdio:
918 restore_confirm_stdio(&saved_stdin, &saved_stdout);
919 return r;
920 }
921
922 static int get_fixed_user(const ExecContext *c, const char **user,
923 uid_t *uid, gid_t *gid,
924 const char **home, const char **shell) {
925 int r;
926 const char *name;
927
928 assert(c);
929
930 if (!c->user)
931 return 0;
932
933 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
934 * (i.e. are "/" or "/bin/nologin"). */
935
936 name = c->user;
937 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
938 if (r < 0)
939 return r;
940
941 *user = name;
942 return 0;
943 }
944
945 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
946 int r;
947 const char *name;
948
949 assert(c);
950
951 if (!c->group)
952 return 0;
953
954 name = c->group;
955 r = get_group_creds(&name, gid, 0);
956 if (r < 0)
957 return r;
958
959 *group = name;
960 return 0;
961 }
962
963 static int get_supplementary_groups(const ExecContext *c, const char *user,
964 const char *group, gid_t gid,
965 gid_t **supplementary_gids, int *ngids) {
966 char **i;
967 int r, k = 0;
968 int ngroups_max;
969 bool keep_groups = false;
970 gid_t *groups = NULL;
971 _cleanup_free_ gid_t *l_gids = NULL;
972
973 assert(c);
974
975 /*
976 * If user is given, then lookup GID and supplementary groups list.
977 * We avoid NSS lookups for gid=0. Also we have to initialize groups
978 * here and as early as possible so we keep the list of supplementary
979 * groups of the caller.
980 */
981 if (user && gid_is_valid(gid) && gid != 0) {
982 /* First step, initialize groups from /etc/groups */
983 if (initgroups(user, gid) < 0)
984 return -errno;
985
986 keep_groups = true;
987 }
988
989 if (strv_isempty(c->supplementary_groups))
990 return 0;
991
992 /*
993 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
994 * be positive, otherwise fail.
995 */
996 errno = 0;
997 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
998 if (ngroups_max <= 0)
999 return errno_or_else(EOPNOTSUPP);
1000
1001 l_gids = new(gid_t, ngroups_max);
1002 if (!l_gids)
1003 return -ENOMEM;
1004
1005 if (keep_groups) {
1006 /*
1007 * Lookup the list of groups that the user belongs to, we
1008 * avoid NSS lookups here too for gid=0.
1009 */
1010 k = ngroups_max;
1011 if (getgrouplist(user, gid, l_gids, &k) < 0)
1012 return -EINVAL;
1013 } else
1014 k = 0;
1015
1016 STRV_FOREACH(i, c->supplementary_groups) {
1017 const char *g;
1018
1019 if (k >= ngroups_max)
1020 return -E2BIG;
1021
1022 g = *i;
1023 r = get_group_creds(&g, l_gids+k, 0);
1024 if (r < 0)
1025 return r;
1026
1027 k++;
1028 }
1029
1030 /*
1031 * Sets ngids to zero to drop all supplementary groups, happens
1032 * when we are under root and SupplementaryGroups= is empty.
1033 */
1034 if (k == 0) {
1035 *ngids = 0;
1036 return 0;
1037 }
1038
1039 /* Otherwise get the final list of supplementary groups */
1040 groups = memdup(l_gids, sizeof(gid_t) * k);
1041 if (!groups)
1042 return -ENOMEM;
1043
1044 *supplementary_gids = groups;
1045 *ngids = k;
1046
1047 groups = NULL;
1048
1049 return 0;
1050 }
1051
/* Applies the group credentials: first the supplementary groups (only if ngids > 0), then, if gid
 * is valid, the real, effective and saved GID. Returns 0 on success, negative errno-style error
 * otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1070
1071 static int enforce_user(const ExecContext *context, uid_t uid) {
1072 assert(context);
1073
1074 if (!uid_is_valid(uid))
1075 return 0;
1076
1077 /* Sets (but doesn't look up) the uid and make sure we keep the
1078 * capabilities while doing so. */
1079
1080 if (context->capability_ambient_set != 0) {
1081
1082 /* First step: If we need to keep capabilities but
1083 * drop privileges we need to make sure we keep our
1084 * caps, while we drop privileges. */
1085 if (uid != 0) {
1086 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1087
1088 if (prctl(PR_GET_SECUREBITS) != sb)
1089 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1090 return -errno;
1091 }
1092 }
1093
1094 /* Second step: actually set the uids */
1095 if (setresuid(uid, uid, uid) < 0)
1096 return -errno;
1097
1098 /* At this point we should have all necessary capabilities but
1099 are otherwise a normal user. However, the caps might got
1100 corrupted due to the setresuid() so we need clean them up
1101 later. This is done outside of this call. */
1102
1103 return 0;
1104 }
1105
1106 #if HAVE_PAM
1107
1108 static int null_conv(
1109 int num_msg,
1110 const struct pam_message **msg,
1111 struct pam_response **resp,
1112 void *appdata_ptr) {
1113
1114 /* We don't support conversations */
1115
1116 return PAM_CONV_ERR;
1117 }
1118
1119 #endif
1120
1121 static int setup_pam(
1122 const char *name,
1123 const char *user,
1124 uid_t uid,
1125 gid_t gid,
1126 const char *tty,
1127 char ***env,
1128 const int fds[], size_t n_fds) {
1129
1130 #if HAVE_PAM
1131
1132 static const struct pam_conv conv = {
1133 .conv = null_conv,
1134 .appdata_ptr = NULL
1135 };
1136
1137 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1138 pam_handle_t *handle = NULL;
1139 sigset_t old_ss;
1140 int pam_code = PAM_SUCCESS, r;
1141 char **nv, **e = NULL;
1142 bool close_session = false;
1143 pid_t pam_pid = 0, parent_pid;
1144 int flags = 0;
1145
1146 assert(name);
1147 assert(user);
1148 assert(env);
1149
1150 /* We set up PAM in the parent process, then fork. The child
1151 * will then stay around until killed via PR_GET_PDEATHSIG or
1152 * systemd via the cgroup logic. It will then remove the PAM
1153 * session again. The parent process will exec() the actual
1154 * daemon. We do things this way to ensure that the main PID
1155 * of the daemon is the one we initially fork()ed. */
1156
1157 r = barrier_create(&barrier);
1158 if (r < 0)
1159 goto fail;
1160
1161 if (log_get_max_level() < LOG_DEBUG)
1162 flags |= PAM_SILENT;
1163
1164 pam_code = pam_start(name, user, &conv, &handle);
1165 if (pam_code != PAM_SUCCESS) {
1166 handle = NULL;
1167 goto fail;
1168 }
1169
1170 if (!tty) {
1171 _cleanup_free_ char *q = NULL;
1172
1173 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1174 * out if that's the case, and read the TTY off it. */
1175
1176 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1177 tty = strjoina("/dev/", q);
1178 }
1179
1180 if (tty) {
1181 pam_code = pam_set_item(handle, PAM_TTY, tty);
1182 if (pam_code != PAM_SUCCESS)
1183 goto fail;
1184 }
1185
1186 STRV_FOREACH(nv, *env) {
1187 pam_code = pam_putenv(handle, *nv);
1188 if (pam_code != PAM_SUCCESS)
1189 goto fail;
1190 }
1191
1192 pam_code = pam_acct_mgmt(handle, flags);
1193 if (pam_code != PAM_SUCCESS)
1194 goto fail;
1195
1196 pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
1197 if (pam_code != PAM_SUCCESS)
1198 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));
1199
1200 pam_code = pam_open_session(handle, flags);
1201 if (pam_code != PAM_SUCCESS)
1202 goto fail;
1203
1204 close_session = true;
1205
1206 e = pam_getenvlist(handle);
1207 if (!e) {
1208 pam_code = PAM_BUF_ERR;
1209 goto fail;
1210 }
1211
1212 /* Block SIGTERM, so that we know that it won't get lost in
1213 * the child */
1214
1215 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1216
1217 parent_pid = getpid_cached();
1218
1219 r = safe_fork("(sd-pam)", 0, &pam_pid);
1220 if (r < 0)
1221 goto fail;
1222 if (r == 0) {
1223 int sig, ret = EXIT_PAM;
1224
1225 /* The child's job is to reset the PAM session on
1226 * termination */
1227 barrier_set_role(&barrier, BARRIER_CHILD);
1228
1229 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
1230 * are open here that have been opened by PAM. */
1231 (void) close_many(fds, n_fds);
1232
1233 /* Drop privileges - we don't need any to pam_close_session
1234 * and this will make PR_SET_PDEATHSIG work in most cases.
1235 * If this fails, ignore the error - but expect sd-pam threads
1236 * to fail to exit normally */
1237
1238 r = maybe_setgroups(0, NULL);
1239 if (r < 0)
1240 log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
1241 if (setresgid(gid, gid, gid) < 0)
1242 log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
1243 if (setresuid(uid, uid, uid) < 0)
1244 log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
1245
1246 (void) ignore_signals(SIGPIPE, -1);
1247
1248 /* Wait until our parent died. This will only work if
1249 * the above setresuid() succeeds, otherwise the kernel
1250 * will not allow unprivileged parents kill their privileged
1251 * children this way. We rely on the control groups kill logic
1252 * to do the rest for us. */
1253 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1254 goto child_finish;
1255
1256 /* Tell the parent that our setup is done. This is especially
1257 * important regarding dropping privileges. Otherwise, unit
1258 * setup might race against our setresuid(2) call.
1259 *
1260 * If the parent aborted, we'll detect this below, hence ignore
1261 * return failure here. */
1262 (void) barrier_place(&barrier);
1263
1264 /* Check if our parent process might already have died? */
1265 if (getppid() == parent_pid) {
1266 sigset_t ss;
1267
1268 assert_se(sigemptyset(&ss) >= 0);
1269 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1270
1271 for (;;) {
1272 if (sigwait(&ss, &sig) < 0) {
1273 if (errno == EINTR)
1274 continue;
1275
1276 goto child_finish;
1277 }
1278
1279 assert(sig == SIGTERM);
1280 break;
1281 }
1282 }
1283
1284 pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
1285 if (pam_code != PAM_SUCCESS)
1286 goto child_finish;
1287
1288 /* If our parent died we'll end the session */
1289 if (getppid() != parent_pid) {
1290 pam_code = pam_close_session(handle, flags);
1291 if (pam_code != PAM_SUCCESS)
1292 goto child_finish;
1293 }
1294
1295 ret = 0;
1296
1297 child_finish:
1298 pam_end(handle, pam_code | flags);
1299 _exit(ret);
1300 }
1301
1302 barrier_set_role(&barrier, BARRIER_PARENT);
1303
1304 /* If the child was forked off successfully it will do all the
1305 * cleanups, so forget about the handle here. */
1306 handle = NULL;
1307
1308 /* Unblock SIGTERM again in the parent */
1309 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1310
1311 /* We close the log explicitly here, since the PAM modules
1312 * might have opened it, but we don't want this fd around. */
1313 closelog();
1314
1315 /* Synchronously wait for the child to initialize. We don't care for
1316 * errors as we cannot recover. However, warn loudly if it happens. */
1317 if (!barrier_place_and_sync(&barrier))
1318 log_error("PAM initialization failed");
1319
1320 return strv_free_and_replace(*env, e);
1321
1322 fail:
1323 if (pam_code != PAM_SUCCESS) {
1324 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1325 r = -EPERM; /* PAM errors do not map to errno */
1326 } else
1327 log_error_errno(r, "PAM failed: %m");
1328
1329 if (handle) {
1330 if (close_session)
1331 pam_code = pam_close_session(handle, flags);
1332
1333 pam_end(handle, pam_code | flags);
1334 }
1335
1336 strv_free(e);
1337 closelog();
1338
1339 return r;
1340 #else
1341 return 0;
1342 #endif
1343 }
1344
/* Renames the current process to "(<name>)", where <name> is derived from the last component of the
 * given path. If that component is empty the fixed name "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        /* '(' + at most 8 characters + ')' + NUL = 11 bytes. The visible part hence fits into 10
         * characters (i.e. the length of "/sbin/init"), so that it looks pretty in /bin/ps. */
        char buf[11];
        const char *name;
        size_t len, pos = 0;

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        len = strlen(name);
        if (len > 8) {
                /* Keep the tail rather than the head: the end of the process name is usually more
                 * interesting, since the first bit might just be "systemd-". */
                name += len - 8;
                len = 8;
        }

        buf[pos++] = '(';
        memcpy(buf + pos, name, len);
        pos += len;
        buf[pos++] = ')';
        buf[pos] = 0;

        rename_process(buf);
}
1375
1376 static bool context_has_address_families(const ExecContext *c) {
1377 assert(c);
1378
1379 return c->address_families_whitelist ||
1380 !set_isempty(c->address_families);
1381 }
1382
1383 static bool context_has_syscall_filters(const ExecContext *c) {
1384 assert(c);
1385
1386 return c->syscall_whitelist ||
1387 !hashmap_isempty(c->syscall_filter);
1388 }
1389
1390 static bool context_has_no_new_privileges(const ExecContext *c) {
1391 assert(c);
1392
1393 if (c->no_new_privileges)
1394 return true;
1395
1396 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1397 return false;
1398
1399 /* We need NNP if we have any form of seccomp and are unprivileged */
1400 return context_has_address_families(c) ||
1401 c->memory_deny_write_execute ||
1402 c->restrict_realtime ||
1403 c->restrict_suid_sgid ||
1404 exec_context_restrict_namespaces_set(c) ||
1405 c->protect_clock ||
1406 c->protect_kernel_tunables ||
1407 c->protect_kernel_modules ||
1408 c->protect_kernel_logs ||
1409 c->private_devices ||
1410 context_has_syscall_filters(c) ||
1411 !set_isempty(c->syscall_archs) ||
1412 c->lock_personality ||
1413 c->protect_hostname;
1414 }
1415
1416 #if HAVE_SECCOMP
1417
1418 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1419
1420 if (is_seccomp_available())
1421 return false;
1422
1423 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1424 return true;
1425 }
1426
1427 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1428 uint32_t negative_action, default_action, action;
1429 int r;
1430
1431 assert(u);
1432 assert(c);
1433
1434 if (!context_has_syscall_filters(c))
1435 return 0;
1436
1437 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1438 return 0;
1439
1440 negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1441
1442 if (c->syscall_whitelist) {
1443 default_action = negative_action;
1444 action = SCMP_ACT_ALLOW;
1445 } else {
1446 default_action = SCMP_ACT_ALLOW;
1447 action = negative_action;
1448 }
1449
1450 if (needs_ambient_hack) {
1451 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1452 if (r < 0)
1453 return r;
1454 }
1455
1456 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1457 }
1458
1459 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1460 assert(u);
1461 assert(c);
1462
1463 if (set_isempty(c->syscall_archs))
1464 return 0;
1465
1466 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1467 return 0;
1468
1469 return seccomp_restrict_archs(c->syscall_archs);
1470 }
1471
1472 static int apply_address_families(const Unit* u, const ExecContext *c) {
1473 assert(u);
1474 assert(c);
1475
1476 if (!context_has_address_families(c))
1477 return 0;
1478
1479 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1480 return 0;
1481
1482 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1483 }
1484
1485 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1486 assert(u);
1487 assert(c);
1488
1489 if (!c->memory_deny_write_execute)
1490 return 0;
1491
1492 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1493 return 0;
1494
1495 return seccomp_memory_deny_write_execute();
1496 }
1497
1498 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1499 assert(u);
1500 assert(c);
1501
1502 if (!c->restrict_realtime)
1503 return 0;
1504
1505 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1506 return 0;
1507
1508 return seccomp_restrict_realtime();
1509 }
1510
1511 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1512 assert(u);
1513 assert(c);
1514
1515 if (!c->restrict_suid_sgid)
1516 return 0;
1517
1518 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1519 return 0;
1520
1521 return seccomp_restrict_suid_sgid();
1522 }
1523
1524 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1525 assert(u);
1526 assert(c);
1527
1528 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1529 * let's protect even those systems where this is left on in the kernel. */
1530
1531 if (!c->protect_kernel_tunables)
1532 return 0;
1533
1534 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1535 return 0;
1536
1537 return seccomp_protect_sysctl();
1538 }
1539
1540 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1541 assert(u);
1542 assert(c);
1543
1544 /* Turn off module syscalls on ProtectKernelModules=yes */
1545
1546 if (!c->protect_kernel_modules)
1547 return 0;
1548
1549 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1550 return 0;
1551
1552 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1553 }
1554
1555 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1556 assert(u);
1557 assert(c);
1558
1559 if (!c->protect_kernel_logs)
1560 return 0;
1561
1562 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1563 return 0;
1564
1565 return seccomp_protect_syslog();
1566 }
1567
1568 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1569 assert(u);
1570 assert(c);
1571
1572 if (!c->protect_clock)
1573 return 0;
1574
1575 if (skip_seccomp_unavailable(u, "ProtectClock="))
1576 return 0;
1577
1578 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1579 }
1580
1581 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1582 assert(u);
1583 assert(c);
1584
1585 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1586
1587 if (!c->private_devices)
1588 return 0;
1589
1590 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1591 return 0;
1592
1593 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1594 }
1595
1596 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1597 assert(u);
1598 assert(c);
1599
1600 if (!exec_context_restrict_namespaces_set(c))
1601 return 0;
1602
1603 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1604 return 0;
1605
1606 return seccomp_restrict_namespaces(c->restrict_namespaces);
1607 }
1608
1609 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1610 unsigned long personality;
1611 int r;
1612
1613 assert(u);
1614 assert(c);
1615
1616 if (!c->lock_personality)
1617 return 0;
1618
1619 if (skip_seccomp_unavailable(u, "LockPersonality="))
1620 return 0;
1621
1622 personality = c->personality;
1623
1624 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1625 if (personality == PERSONALITY_INVALID) {
1626
1627 r = opinionated_personality(&personality);
1628 if (r < 0)
1629 return r;
1630 }
1631
1632 return seccomp_lock_personality(personality);
1633 }
1634
1635 #endif
1636
1637 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1638 assert(idle_pipe);
1639
1640 idle_pipe[1] = safe_close(idle_pipe[1]);
1641 idle_pipe[2] = safe_close(idle_pipe[2]);
1642
1643 if (idle_pipe[0] >= 0) {
1644 int r;
1645
1646 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1647
1648 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1649 ssize_t n;
1650
1651 /* Signal systemd that we are bored and want to continue. */
1652 n = write(idle_pipe[3], "x", 1);
1653 if (n > 0)
1654 /* Wait for systemd to react to the signal above. */
1655 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1656 }
1657
1658 idle_pipe[0] = safe_close(idle_pipe[0]);
1659
1660 }
1661
1662 idle_pipe[3] = safe_close(idle_pipe[3]);
1663 }
1664
1665 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1666
1667 static int build_environment(
1668 const Unit *u,
1669 const ExecContext *c,
1670 const ExecParameters *p,
1671 size_t n_fds,
1672 const char *home,
1673 const char *username,
1674 const char *shell,
1675 dev_t journal_stream_dev,
1676 ino_t journal_stream_ino,
1677 char ***ret) {
1678
1679 _cleanup_strv_free_ char **our_env = NULL;
1680 ExecDirectoryType t;
1681 size_t n_env = 0;
1682 char *x;
1683
1684 assert(u);
1685 assert(c);
1686 assert(p);
1687 assert(ret);
1688
1689 our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX);
1690 if (!our_env)
1691 return -ENOMEM;
1692
1693 if (n_fds > 0) {
1694 _cleanup_free_ char *joined = NULL;
1695
1696 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1697 return -ENOMEM;
1698 our_env[n_env++] = x;
1699
1700 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1701 return -ENOMEM;
1702 our_env[n_env++] = x;
1703
1704 joined = strv_join(p->fd_names, ":");
1705 if (!joined)
1706 return -ENOMEM;
1707
1708 x = strjoin("LISTEN_FDNAMES=", joined);
1709 if (!x)
1710 return -ENOMEM;
1711 our_env[n_env++] = x;
1712 }
1713
1714 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1715 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1716 return -ENOMEM;
1717 our_env[n_env++] = x;
1718
1719 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1720 return -ENOMEM;
1721 our_env[n_env++] = x;
1722 }
1723
1724 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1725 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1726 * check the database directly. */
1727 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1728 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1729 if (!x)
1730 return -ENOMEM;
1731 our_env[n_env++] = x;
1732 }
1733
1734 if (home) {
1735 x = strjoin("HOME=", home);
1736 if (!x)
1737 return -ENOMEM;
1738
1739 path_simplify(x + 5, true);
1740 our_env[n_env++] = x;
1741 }
1742
1743 if (username) {
1744 x = strjoin("LOGNAME=", username);
1745 if (!x)
1746 return -ENOMEM;
1747 our_env[n_env++] = x;
1748
1749 x = strjoin("USER=", username);
1750 if (!x)
1751 return -ENOMEM;
1752 our_env[n_env++] = x;
1753 }
1754
1755 if (shell) {
1756 x = strjoin("SHELL=", shell);
1757 if (!x)
1758 return -ENOMEM;
1759
1760 path_simplify(x + 6, true);
1761 our_env[n_env++] = x;
1762 }
1763
1764 if (!sd_id128_is_null(u->invocation_id)) {
1765 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1766 return -ENOMEM;
1767
1768 our_env[n_env++] = x;
1769 }
1770
1771 if (exec_context_needs_term(c)) {
1772 const char *tty_path, *term = NULL;
1773
1774 tty_path = exec_context_tty_path(c);
1775
1776 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1777 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1778 * passes to PID 1 ends up all the way in the console login shown. */
1779
1780 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1781 term = getenv("TERM");
1782 if (!term)
1783 term = default_term_for_tty(tty_path);
1784
1785 x = strjoin("TERM=", term);
1786 if (!x)
1787 return -ENOMEM;
1788 our_env[n_env++] = x;
1789 }
1790
1791 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1792 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1793 return -ENOMEM;
1794
1795 our_env[n_env++] = x;
1796 }
1797
1798 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1799 _cleanup_free_ char *pre = NULL, *joined = NULL;
1800 const char *n;
1801
1802 if (!p->prefix[t])
1803 continue;
1804
1805 if (strv_isempty(c->directories[t].paths))
1806 continue;
1807
1808 n = exec_directory_env_name_to_string(t);
1809 if (!n)
1810 continue;
1811
1812 pre = strjoin(p->prefix[t], "/");
1813 if (!pre)
1814 return -ENOMEM;
1815
1816 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1817 if (!joined)
1818 return -ENOMEM;
1819
1820 x = strjoin(n, "=", joined);
1821 if (!x)
1822 return -ENOMEM;
1823
1824 our_env[n_env++] = x;
1825 }
1826
1827 our_env[n_env++] = NULL;
1828 assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1829
1830 *ret = TAKE_PTR(our_env);
1831
1832 return 0;
1833 }
1834
1835 static int build_pass_environment(const ExecContext *c, char ***ret) {
1836 _cleanup_strv_free_ char **pass_env = NULL;
1837 size_t n_env = 0, n_bufsize = 0;
1838 char **i;
1839
1840 STRV_FOREACH(i, c->pass_environment) {
1841 _cleanup_free_ char *x = NULL;
1842 char *v;
1843
1844 v = getenv(*i);
1845 if (!v)
1846 continue;
1847 x = strjoin(*i, "=", v);
1848 if (!x)
1849 return -ENOMEM;
1850
1851 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1852 return -ENOMEM;
1853
1854 pass_env[n_env++] = TAKE_PTR(x);
1855 pass_env[n_env] = NULL;
1856 }
1857
1858 *ret = TAKE_PTR(pass_env);
1859
1860 return 0;
1861 }
1862
1863 static bool exec_needs_mount_namespace(
1864 const ExecContext *context,
1865 const ExecParameters *params,
1866 const ExecRuntime *runtime) {
1867
1868 assert(context);
1869 assert(params);
1870
1871 if (context->root_image)
1872 return true;
1873
1874 if (!strv_isempty(context->read_write_paths) ||
1875 !strv_isempty(context->read_only_paths) ||
1876 !strv_isempty(context->inaccessible_paths))
1877 return true;
1878
1879 if (context->n_bind_mounts > 0)
1880 return true;
1881
1882 if (context->n_temporary_filesystems > 0)
1883 return true;
1884
1885 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
1886 return true;
1887
1888 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1889 return true;
1890
1891 if (context->private_devices ||
1892 context->private_mounts ||
1893 context->protect_system != PROTECT_SYSTEM_NO ||
1894 context->protect_home != PROTECT_HOME_NO ||
1895 context->protect_kernel_tunables ||
1896 context->protect_kernel_modules ||
1897 context->protect_kernel_logs ||
1898 context->protect_control_groups)
1899 return true;
1900
1901 if (context->root_directory) {
1902 ExecDirectoryType t;
1903
1904 if (context->mount_apivfs)
1905 return true;
1906
1907 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1908 if (!params->prefix[t])
1909 continue;
1910
1911 if (!strv_isempty(context->directories[t].paths))
1912 return true;
1913 }
1914 }
1915
1916 if (context->dynamic_user &&
1917 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1918 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1919 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1920 return true;
1921
1922 return false;
1923 }
1924
1925 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
1926 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1927 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1928 _cleanup_close_ int unshare_ready_fd = -1;
1929 _cleanup_(sigkill_waitp) pid_t pid = 0;
1930 uint64_t c = 1;
1931 ssize_t n;
1932 int r;
1933
1934 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
1935 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
1936 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1937 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1938 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1939 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1940 * continues execution normally.
1941 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
1942 * does not need CAP_SETUID to write the single line mapping to itself. */
1943
1944 /* Can only set up multiple mappings with CAP_SETUID. */
1945 if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
1946 r = asprintf(&uid_map,
1947 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
1948 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
1949 ouid, ouid, uid, uid);
1950 else
1951 r = asprintf(&uid_map,
1952 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
1953 ouid, ouid);
1954
1955 if (r < 0)
1956 return -ENOMEM;
1957
1958 /* Can only set up multiple mappings with CAP_SETGID. */
1959 if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
1960 r = asprintf(&gid_map,
1961 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
1962 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
1963 ogid, ogid, gid, gid);
1964 else
1965 r = asprintf(&gid_map,
1966 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
1967 ogid, ogid);
1968
1969 if (r < 0)
1970 return -ENOMEM;
1971
1972 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1973 * namespace. */
1974 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1975 if (unshare_ready_fd < 0)
1976 return -errno;
1977
1978 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1979 * failed. */
1980 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1981 return -errno;
1982
1983 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1984 if (r < 0)
1985 return r;
1986 if (r == 0) {
1987 _cleanup_close_ int fd = -1;
1988 const char *a;
1989 pid_t ppid;
1990
1991 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1992 * here, after the parent opened its own user namespace. */
1993
1994 ppid = getppid();
1995 errno_pipe[0] = safe_close(errno_pipe[0]);
1996
1997 /* Wait until the parent unshared the user namespace */
1998 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1999 r = -errno;
2000 goto child_fail;
2001 }
2002
2003 /* Disable the setgroups() system call in the child user namespace, for good. */
2004 a = procfs_file_alloca(ppid, "setgroups");
2005 fd = open(a, O_WRONLY|O_CLOEXEC);
2006 if (fd < 0) {
2007 if (errno != ENOENT) {
2008 r = -errno;
2009 goto child_fail;
2010 }
2011
2012 /* If the file is missing the kernel is too old, let's continue anyway. */
2013 } else {
2014 if (write(fd, "deny\n", 5) < 0) {
2015 r = -errno;
2016 goto child_fail;
2017 }
2018
2019 fd = safe_close(fd);
2020 }
2021
2022 /* First write the GID map */
2023 a = procfs_file_alloca(ppid, "gid_map");
2024 fd = open(a, O_WRONLY|O_CLOEXEC);
2025 if (fd < 0) {
2026 r = -errno;
2027 goto child_fail;
2028 }
2029 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2030 r = -errno;
2031 goto child_fail;
2032 }
2033 fd = safe_close(fd);
2034
2035 /* The write the UID map */
2036 a = procfs_file_alloca(ppid, "uid_map");
2037 fd = open(a, O_WRONLY|O_CLOEXEC);
2038 if (fd < 0) {
2039 r = -errno;
2040 goto child_fail;
2041 }
2042 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2043 r = -errno;
2044 goto child_fail;
2045 }
2046
2047 _exit(EXIT_SUCCESS);
2048
2049 child_fail:
2050 (void) write(errno_pipe[1], &r, sizeof(r));
2051 _exit(EXIT_FAILURE);
2052 }
2053
2054 errno_pipe[1] = safe_close(errno_pipe[1]);
2055
2056 if (unshare(CLONE_NEWUSER) < 0)
2057 return -errno;
2058
2059 /* Let the child know that the namespace is ready now */
2060 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2061 return -errno;
2062
2063 /* Try to read an error code from the child */
2064 n = read(errno_pipe[0], &r, sizeof(r));
2065 if (n < 0)
2066 return -errno;
2067 if (n == sizeof(r)) { /* an error code was sent to us */
2068 if (r < 0)
2069 return r;
2070 return -EIO;
2071 }
2072 if (n != 0) /* on success we should have read 0 bytes */
2073 return -EIO;
2074
2075 r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2076 pid = 0;
2077 if (r < 0)
2078 return r;
2079 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2080 return -EIO;
2081
2082 return 0;
2083 }
2084
2085 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2086 if (!context->dynamic_user)
2087 return false;
2088
2089 if (type == EXEC_DIRECTORY_CONFIGURATION)
2090 return false;
2091
2092 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2093 return false;
2094
2095 return true;
2096 }
2097
2098 static int setup_exec_directory(
2099 const ExecContext *context,
2100 const ExecParameters *params,
2101 uid_t uid,
2102 gid_t gid,
2103 ExecDirectoryType type,
2104 int *exit_status) {
2105
2106 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2107 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2108 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2109 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2110 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2111 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2112 };
2113 char **rt;
2114 int r;
2115
2116 assert(context);
2117 assert(params);
2118 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2119 assert(exit_status);
2120
2121 if (!params->prefix[type])
2122 return 0;
2123
2124 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2125 if (!uid_is_valid(uid))
2126 uid = 0;
2127 if (!gid_is_valid(gid))
2128 gid = 0;
2129 }
2130
2131 STRV_FOREACH(rt, context->directories[type].paths) {
2132 _cleanup_free_ char *p = NULL, *pp = NULL;
2133
2134 p = path_join(params->prefix[type], *rt);
2135 if (!p) {
2136 r = -ENOMEM;
2137 goto fail;
2138 }
2139
2140 r = mkdir_parents_label(p, 0755);
2141 if (r < 0)
2142 goto fail;
2143
2144 if (exec_directory_is_private(context, type)) {
2145 _cleanup_free_ char *private_root = NULL;
2146
2147 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2148 * case we want to avoid leaving a directory around fully accessible that is owned by
2149 * a dynamic user whose UID is later on reused. To lock this down we use the same
2150 * trick used by container managers to prohibit host users to get access to files of
2151 * the same UID in containers: we place everything inside a directory that has an
2152 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2153 * for unprivileged host code. We then use fs namespacing to make this directory
2154 * permeable for the service itself.
2155 *
2156 * Specifically: for a service which wants a special directory "foo/" we first create
2157 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2158 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2159 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2160 * unprivileged host users can't look into it. Inside of the namespace of the unit
2161 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2162 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2163 * for the service and making sure it only gets access to the dirs it needs but no
2164 * others. Tricky? Yes, absolutely, but it works!
2165 *
2166 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2167 * to be owned by the service itself.
2168 *
2169 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2170 * for sharing files or sockets with other services. */
2171
2172 private_root = path_join(params->prefix[type], "private");
2173 if (!private_root) {
2174 r = -ENOMEM;
2175 goto fail;
2176 }
2177
2178 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2179 r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2180 if (r < 0)
2181 goto fail;
2182
2183 pp = path_join(private_root, *rt);
2184 if (!pp) {
2185 r = -ENOMEM;
2186 goto fail;
2187 }
2188
2189 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2190 r = mkdir_parents_label(pp, 0755);
2191 if (r < 0)
2192 goto fail;
2193
2194 if (is_dir(p, false) > 0 &&
2195 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2196
2197 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2198 * it over. Most likely the service has been upgraded from one that didn't use
2199 * DynamicUser=1, to one that does. */
2200
2201 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2202 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2203 exec_directory_type_to_string(type), p, pp);
2204
2205 if (rename(p, pp) < 0) {
2206 r = -errno;
2207 goto fail;
2208 }
2209 } else {
2210 /* Otherwise, create the actual directory for the service */
2211
2212 r = mkdir_label(pp, context->directories[type].mode);
2213 if (r < 0 && r != -EEXIST)
2214 goto fail;
2215 }
2216
2217 /* And link it up from the original place */
2218 r = symlink_idempotent(pp, p, true);
2219 if (r < 0)
2220 goto fail;
2221
2222 } else {
2223 _cleanup_free_ char *target = NULL;
2224
2225 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2226 readlink_and_make_absolute(p, &target) >= 0) {
2227 _cleanup_free_ char *q = NULL;
2228
2229 /* This already exists and is a symlink? Interesting. Maybe it's one created
2230 * by DynamicUser=1 (see above)?
2231 *
2232 * We do this for all directory types except for ConfigurationDirectory=,
2233 * since they all support the private/ symlink logic at least in some
2234 * configurations, see above. */
2235
2236 q = path_join(params->prefix[type], "private", *rt);
2237 if (!q) {
2238 r = -ENOMEM;
2239 goto fail;
2240 }
2241
2242 if (path_equal(q, target)) {
2243
2244 /* Hmm, apparently DynamicUser= was once turned on for this service,
2245 * but is no longer. Let's move the directory back up. */
2246
2247 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2248 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2249 exec_directory_type_to_string(type), q, p);
2250
2251 if (unlink(p) < 0) {
2252 r = -errno;
2253 goto fail;
2254 }
2255
2256 if (rename(q, p) < 0) {
2257 r = -errno;
2258 goto fail;
2259 }
2260 }
2261 }
2262
2263 r = mkdir_label(p, context->directories[type].mode);
2264 if (r < 0) {
2265 if (r != -EEXIST)
2266 goto fail;
2267
2268 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2269 struct stat st;
2270
2271 /* Don't change the owner/access mode of the configuration directory,
2272 * as in the common case it is not written to by a service, and shall
2273 * not be writable. */
2274
2275 if (stat(p, &st) < 0) {
2276 r = -errno;
2277 goto fail;
2278 }
2279
2280 /* Still complain if the access mode doesn't match */
2281 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2282 log_warning("%s \'%s\' already exists but the mode is different. "
2283 "(File system: %o %sMode: %o)",
2284 exec_directory_type_to_string(type), *rt,
2285 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2286
2287 continue;
2288 }
2289 }
2290 }
2291
2292 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2293 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2294 * current UID/GID ownership.) */
2295 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2296 if (r < 0)
2297 goto fail;
2298
2299 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2300 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2301 * assignments to exist.*/
2302 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2303 if (r < 0)
2304 goto fail;
2305 }
2306
2307 return 0;
2308
2309 fail:
2310 *exit_status = exit_status_table[type];
2311 return r;
2312 }
2313
#if ENABLE_SMACK
/* Applies the SMACK label for the calling process (PID argument 0). If the context configures an
 * explicit process label, that one is applied. Otherwise, and only when SMACK_DEFAULT_PROCESS_LABEL is
 * defined at build time, the SMACK_ATTR_EXEC attribute read from the command's binary path is applied,
 * falling back to SMACK_DEFAULT_PROCESS_LABEL if no such attribute could be read.
 *
 * Returns 0 on success (or if there is nothing to do), a negative errno-style value on failure. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r;

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;

                /* A missing attribute or missing xattr support is not an error here, we simply use the
                 * compiled-in default label then. */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                r = mac_smack_apply_pid(0, exec_label ?: SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2346
2347 static int compile_bind_mounts(
2348 const ExecContext *context,
2349 const ExecParameters *params,
2350 BindMount **ret_bind_mounts,
2351 size_t *ret_n_bind_mounts,
2352 char ***ret_empty_directories) {
2353
2354 _cleanup_strv_free_ char **empty_directories = NULL;
2355 BindMount *bind_mounts;
2356 size_t n, h = 0, i;
2357 ExecDirectoryType t;
2358 int r;
2359
2360 assert(context);
2361 assert(params);
2362 assert(ret_bind_mounts);
2363 assert(ret_n_bind_mounts);
2364 assert(ret_empty_directories);
2365
2366 n = context->n_bind_mounts;
2367 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2368 if (!params->prefix[t])
2369 continue;
2370
2371 n += strv_length(context->directories[t].paths);
2372 }
2373
2374 if (n <= 0) {
2375 *ret_bind_mounts = NULL;
2376 *ret_n_bind_mounts = 0;
2377 *ret_empty_directories = NULL;
2378 return 0;
2379 }
2380
2381 bind_mounts = new(BindMount, n);
2382 if (!bind_mounts)
2383 return -ENOMEM;
2384
2385 for (i = 0; i < context->n_bind_mounts; i++) {
2386 BindMount *item = context->bind_mounts + i;
2387 char *s, *d;
2388
2389 s = strdup(item->source);
2390 if (!s) {
2391 r = -ENOMEM;
2392 goto finish;
2393 }
2394
2395 d = strdup(item->destination);
2396 if (!d) {
2397 free(s);
2398 r = -ENOMEM;
2399 goto finish;
2400 }
2401
2402 bind_mounts[h++] = (BindMount) {
2403 .source = s,
2404 .destination = d,
2405 .read_only = item->read_only,
2406 .recursive = item->recursive,
2407 .ignore_enoent = item->ignore_enoent,
2408 };
2409 }
2410
2411 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2412 char **suffix;
2413
2414 if (!params->prefix[t])
2415 continue;
2416
2417 if (strv_isempty(context->directories[t].paths))
2418 continue;
2419
2420 if (exec_directory_is_private(context, t) &&
2421 !(context->root_directory || context->root_image)) {
2422 char *private_root;
2423
2424 /* So this is for a dynamic user, and we need to make sure the process can access its own
2425 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2426 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2427
2428 private_root = path_join(params->prefix[t], "private");
2429 if (!private_root) {
2430 r = -ENOMEM;
2431 goto finish;
2432 }
2433
2434 r = strv_consume(&empty_directories, private_root);
2435 if (r < 0)
2436 goto finish;
2437 }
2438
2439 STRV_FOREACH(suffix, context->directories[t].paths) {
2440 char *s, *d;
2441
2442 if (exec_directory_is_private(context, t))
2443 s = path_join(params->prefix[t], "private", *suffix);
2444 else
2445 s = path_join(params->prefix[t], *suffix);
2446 if (!s) {
2447 r = -ENOMEM;
2448 goto finish;
2449 }
2450
2451 if (exec_directory_is_private(context, t) &&
2452 (context->root_directory || context->root_image))
2453 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2454 * directory is not created on the root directory. So, let's bind-mount the directory
2455 * on the 'non-private' place. */
2456 d = path_join(params->prefix[t], *suffix);
2457 else
2458 d = strdup(s);
2459 if (!d) {
2460 free(s);
2461 r = -ENOMEM;
2462 goto finish;
2463 }
2464
2465 bind_mounts[h++] = (BindMount) {
2466 .source = s,
2467 .destination = d,
2468 .read_only = false,
2469 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2470 .recursive = true,
2471 .ignore_enoent = false,
2472 };
2473 }
2474 }
2475
2476 assert(h == n);
2477
2478 *ret_bind_mounts = bind_mounts;
2479 *ret_n_bind_mounts = n;
2480 *ret_empty_directories = TAKE_PTR(empty_directories);
2481
2482 return (int) n;
2483
2484 finish:
2485 bind_mount_free_many(bind_mounts, h);
2486 return r;
2487 }
2488
2489 static bool insist_on_sandboxing(
2490 const ExecContext *context,
2491 const char *root_dir,
2492 const char *root_image,
2493 const BindMount *bind_mounts,
2494 size_t n_bind_mounts) {
2495
2496 size_t i;
2497
2498 assert(context);
2499 assert(n_bind_mounts == 0 || bind_mounts);
2500
2501 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2502 * would alter the view on the file system beyond making things read-only or invisble, i.e. would
2503 * rearrange stuff in a way we cannot ignore gracefully. */
2504
2505 if (context->n_temporary_filesystems > 0)
2506 return true;
2507
2508 if (root_dir || root_image)
2509 return true;
2510
2511 if (context->dynamic_user)
2512 return true;
2513
2514 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2515 * essential. */
2516 for (i = 0; i < n_bind_mounts; i++)
2517 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2518 return true;
2519
2520 return false;
2521 }
2522
2523 static int apply_mount_namespace(
2524 const Unit *u,
2525 const ExecCommand *command,
2526 const ExecContext *context,
2527 const ExecParameters *params,
2528 const ExecRuntime *runtime,
2529 char **error_path) {
2530
2531 _cleanup_strv_free_ char **empty_directories = NULL;
2532 char *tmp = NULL, *var = NULL;
2533 const char *root_dir = NULL, *root_image = NULL;
2534 NamespaceInfo ns_info;
2535 bool needs_sandboxing;
2536 BindMount *bind_mounts = NULL;
2537 size_t n_bind_mounts = 0;
2538 int r;
2539
2540 assert(context);
2541
2542 /* The runtime struct only contains the parent of the private /tmp,
2543 * which is non-accessible to world users. Inside of it there's a /tmp
2544 * that is sticky, and that's the one we want to use here. */
2545
2546 if (context->private_tmp && runtime) {
2547 if (runtime->tmp_dir)
2548 tmp = strjoina(runtime->tmp_dir, "/tmp");
2549 if (runtime->var_tmp_dir)
2550 var = strjoina(runtime->var_tmp_dir, "/tmp");
2551 }
2552
2553 if (params->flags & EXEC_APPLY_CHROOT) {
2554 root_image = context->root_image;
2555
2556 if (!root_image)
2557 root_dir = context->root_directory;
2558 }
2559
2560 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2561 if (r < 0)
2562 return r;
2563
2564 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2565 if (needs_sandboxing)
2566 ns_info = (NamespaceInfo) {
2567 .ignore_protect_paths = false,
2568 .private_dev = context->private_devices,
2569 .protect_control_groups = context->protect_control_groups,
2570 .protect_kernel_tunables = context->protect_kernel_tunables,
2571 .protect_kernel_modules = context->protect_kernel_modules,
2572 .protect_kernel_logs = context->protect_kernel_logs,
2573 .protect_hostname = context->protect_hostname,
2574 .mount_apivfs = context->mount_apivfs,
2575 .private_mounts = context->private_mounts,
2576 };
2577 else if (!context->dynamic_user && root_dir)
2578 /*
2579 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2580 * sandbox info, otherwise enforce it, don't ignore protected paths and
2581 * fail if we are enable to apply the sandbox inside the mount namespace.
2582 */
2583 ns_info = (NamespaceInfo) {
2584 .ignore_protect_paths = true,
2585 };
2586 else
2587 ns_info = (NamespaceInfo) {};
2588
2589 if (context->mount_flags == MS_SHARED)
2590 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2591
2592 r = setup_namespace(root_dir, root_image,
2593 &ns_info, context->read_write_paths,
2594 needs_sandboxing ? context->read_only_paths : NULL,
2595 needs_sandboxing ? context->inaccessible_paths : NULL,
2596 empty_directories,
2597 bind_mounts,
2598 n_bind_mounts,
2599 context->temporary_filesystems,
2600 context->n_temporary_filesystems,
2601 tmp,
2602 var,
2603 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2604 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2605 context->mount_flags,
2606 DISSECT_IMAGE_DISCARD_ON_LOOP|DISSECT_IMAGE_RELAX_VAR_CHECK|DISSECT_IMAGE_FSCK,
2607 error_path);
2608
2609 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2610 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2611 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2612 * completely different execution environment. */
2613 if (r == -ENOANO) {
2614 if (insist_on_sandboxing(
2615 context,
2616 root_dir, root_image,
2617 bind_mounts,
2618 n_bind_mounts)) {
2619 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2620 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2621 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2622
2623 r = -EOPNOTSUPP;
2624 } else {
2625 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2626 r = 0;
2627 }
2628 }
2629
2630 bind_mount_free_many(bind_mounts, n_bind_mounts);
2631 return r;
2632 }
2633
2634 static int apply_working_directory(
2635 const ExecContext *context,
2636 const ExecParameters *params,
2637 const char *home,
2638 int *exit_status) {
2639
2640 const char *d, *wd;
2641
2642 assert(context);
2643 assert(exit_status);
2644
2645 if (context->working_directory_home) {
2646
2647 if (!home) {
2648 *exit_status = EXIT_CHDIR;
2649 return -ENXIO;
2650 }
2651
2652 wd = home;
2653
2654 } else if (context->working_directory)
2655 wd = context->working_directory;
2656 else
2657 wd = "/";
2658
2659 if (params->flags & EXEC_APPLY_CHROOT)
2660 d = wd;
2661 else
2662 d = prefix_roota(context->root_directory, wd);
2663
2664 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2665 *exit_status = EXIT_CHDIR;
2666 return -errno;
2667 }
2668
2669 return 0;
2670 }
2671
2672 static int apply_root_directory(
2673 const ExecContext *context,
2674 const ExecParameters *params,
2675 const bool needs_mount_ns,
2676 int *exit_status) {
2677
2678 assert(context);
2679 assert(exit_status);
2680
2681 if (params->flags & EXEC_APPLY_CHROOT) {
2682 if (!needs_mount_ns && context->root_directory)
2683 if (chroot(context->root_directory) < 0) {
2684 *exit_status = EXIT_CHROOT;
2685 return -errno;
2686 }
2687 }
2688
2689 return 0;
2690 }
2691
2692 static int setup_keyring(
2693 const Unit *u,
2694 const ExecContext *context,
2695 const ExecParameters *p,
2696 uid_t uid, gid_t gid) {
2697
2698 key_serial_t keyring;
2699 int r = 0;
2700 uid_t saved_uid;
2701 gid_t saved_gid;
2702
2703 assert(u);
2704 assert(context);
2705 assert(p);
2706
2707 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2708 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2709 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2710 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2711 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2712 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2713
2714 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2715 return 0;
2716
2717 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2718 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2719 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2720 * & group is just as nasty as acquiring a reference to the user keyring. */
2721
2722 saved_uid = getuid();
2723 saved_gid = getgid();
2724
2725 if (gid_is_valid(gid) && gid != saved_gid) {
2726 if (setregid(gid, -1) < 0)
2727 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2728 }
2729
2730 if (uid_is_valid(uid) && uid != saved_uid) {
2731 if (setreuid(uid, -1) < 0) {
2732 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2733 goto out;
2734 }
2735 }
2736
2737 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2738 if (keyring == -1) {
2739 if (errno == ENOSYS)
2740 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2741 else if (IN_SET(errno, EACCES, EPERM))
2742 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2743 else if (errno == EDQUOT)
2744 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2745 else
2746 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2747
2748 goto out;
2749 }
2750
2751 /* When requested link the user keyring into the session keyring. */
2752 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2753
2754 if (keyctl(KEYCTL_LINK,
2755 KEY_SPEC_USER_KEYRING,
2756 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2757 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2758 goto out;
2759 }
2760 }
2761
2762 /* Restore uid/gid back */
2763 if (uid_is_valid(uid) && uid != saved_uid) {
2764 if (setreuid(saved_uid, -1) < 0) {
2765 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2766 goto out;
2767 }
2768 }
2769
2770 if (gid_is_valid(gid) && gid != saved_gid) {
2771 if (setregid(saved_gid, -1) < 0)
2772 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2773 }
2774
2775 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2776 if (!sd_id128_is_null(u->invocation_id)) {
2777 key_serial_t key;
2778
2779 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2780 if (key == -1)
2781 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2782 else {
2783 if (keyctl(KEYCTL_SETPERM, key,
2784 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2785 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2786 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2787 }
2788 }
2789
2790 out:
2791 /* Revert back uid & gid for the the last time, and exit */
2792 /* no extra logging, as only the first already reported error matters */
2793 if (getuid() != saved_uid)
2794 (void) setreuid(saved_uid, -1);
2795
2796 if (getgid() != saved_gid)
2797 (void) setregid(saved_gid, -1);
2798
2799 return r;
2800 }
2801
/* Appends each non-negative fd of the two-element 'pair' to 'array' (first element first), advancing the
 * element counter *n once per appended fd. Negative entries are skipped. The caller must make sure the
 * array has room for up to two more elements. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        size_t k;

        assert(array);
        assert(n);
        assert(pair);

        for (k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2812
2813 static int close_remaining_fds(
2814 const ExecParameters *params,
2815 const ExecRuntime *runtime,
2816 const DynamicCreds *dcreds,
2817 int user_lookup_fd,
2818 int socket_fd,
2819 int exec_fd,
2820 const int *fds, size_t n_fds) {
2821
2822 size_t n_dont_close = 0;
2823 int dont_close[n_fds + 12];
2824
2825 assert(params);
2826
2827 if (params->stdin_fd >= 0)
2828 dont_close[n_dont_close++] = params->stdin_fd;
2829 if (params->stdout_fd >= 0)
2830 dont_close[n_dont_close++] = params->stdout_fd;
2831 if (params->stderr_fd >= 0)
2832 dont_close[n_dont_close++] = params->stderr_fd;
2833
2834 if (socket_fd >= 0)
2835 dont_close[n_dont_close++] = socket_fd;
2836 if (exec_fd >= 0)
2837 dont_close[n_dont_close++] = exec_fd;
2838 if (n_fds > 0) {
2839 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2840 n_dont_close += n_fds;
2841 }
2842
2843 if (runtime)
2844 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2845
2846 if (dcreds) {
2847 if (dcreds->user)
2848 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2849 if (dcreds->group)
2850 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2851 }
2852
2853 if (user_lookup_fd >= 0)
2854 dont_close[n_dont_close++] = user_lookup_fd;
2855
2856 return close_all_fds(dont_close, n_dont_close);
2857 }
2858
2859 static int send_user_lookup(
2860 Unit *unit,
2861 int user_lookup_fd,
2862 uid_t uid,
2863 gid_t gid) {
2864
2865 assert(unit);
2866
2867 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2868 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2869 * specified. */
2870
2871 if (user_lookup_fd < 0)
2872 return 0;
2873
2874 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2875 return 0;
2876
2877 if (writev(user_lookup_fd,
2878 (struct iovec[]) {
2879 IOVEC_INIT(&uid, sizeof(uid)),
2880 IOVEC_INIT(&gid, sizeof(gid)),
2881 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2882 return -errno;
2883
2884 return 0;
2885 }
2886
2887 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2888 int r;
2889
2890 assert(c);
2891 assert(home);
2892 assert(buf);
2893
2894 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2895
2896 if (*home)
2897 return 0;
2898
2899 if (!c->working_directory_home)
2900 return 0;
2901
2902 r = get_home_dir(buf);
2903 if (r < 0)
2904 return r;
2905
2906 *home = *buf;
2907 return 1;
2908 }
2909
2910 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2911 _cleanup_strv_free_ char ** list = NULL;
2912 ExecDirectoryType t;
2913 int r;
2914
2915 assert(c);
2916 assert(p);
2917 assert(ret);
2918
2919 assert(c->dynamic_user);
2920
2921 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2922 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2923 * directories. */
2924
2925 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2926 char **i;
2927
2928 if (t == EXEC_DIRECTORY_CONFIGURATION)
2929 continue;
2930
2931 if (!p->prefix[t])
2932 continue;
2933
2934 STRV_FOREACH(i, c->directories[t].paths) {
2935 char *e;
2936
2937 if (exec_directory_is_private(c, t))
2938 e = path_join(p->prefix[t], "private", *i);
2939 else
2940 e = path_join(p->prefix[t], *i);
2941 if (!e)
2942 return -ENOMEM;
2943
2944 r = strv_consume(&list, e);
2945 if (r < 0)
2946 return r;
2947 }
2948 }
2949
2950 *ret = TAKE_PTR(list);
2951
2952 return 0;
2953 }
2954
2955 static char *exec_command_line(char **argv);
2956
2957 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
2958 bool using_subcgroup;
2959 char *p;
2960
2961 assert(params);
2962 assert(ret);
2963
2964 if (!params->cgroup_path)
2965 return -EINVAL;
2966
2967 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
2968 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
2969 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
2970 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
2971 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
2972 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
2973 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
2974 * flag, which is only passed for the former statements, not for the latter. */
2975
2976 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
2977 if (using_subcgroup)
2978 p = path_join(params->cgroup_path, ".control");
2979 else
2980 p = strdup(params->cgroup_path);
2981 if (!p)
2982 return -ENOMEM;
2983
2984 *ret = p;
2985 return using_subcgroup;
2986 }
2987
2988 static int exec_child(
2989 Unit *unit,
2990 const ExecCommand *command,
2991 const ExecContext *context,
2992 const ExecParameters *params,
2993 ExecRuntime *runtime,
2994 DynamicCreds *dcreds,
2995 int socket_fd,
2996 const int named_iofds[static 3],
2997 int *fds,
2998 size_t n_socket_fds,
2999 size_t n_storage_fds,
3000 char **files_env,
3001 int user_lookup_fd,
3002 int *exit_status) {
3003
3004 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
3005 int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
3006 _cleanup_free_ gid_t *supplementary_gids = NULL;
3007 const char *username = NULL, *groupname = NULL;
3008 _cleanup_free_ char *home_buffer = NULL;
3009 const char *home = NULL, *shell = NULL;
3010 char **final_argv = NULL;
3011 dev_t journal_stream_dev = 0;
3012 ino_t journal_stream_ino = 0;
3013 bool userns_set_up = false;
3014 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3015 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3016 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3017 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
3018 #if HAVE_SELINUX
3019 _cleanup_free_ char *mac_selinux_context_net = NULL;
3020 bool use_selinux = false;
3021 #endif
3022 #if ENABLE_SMACK
3023 bool use_smack = false;
3024 #endif
3025 #if HAVE_APPARMOR
3026 bool use_apparmor = false;
3027 #endif
3028 uid_t saved_uid = getuid();
3029 gid_t saved_gid = getgid();
3030 uid_t uid = UID_INVALID;
3031 gid_t gid = GID_INVALID;
3032 size_t n_fds;
3033 ExecDirectoryType dt;
3034 int secure_bits;
3035 _cleanup_free_ gid_t *gids_after_pam = NULL;
3036 int ngids_after_pam = 0;
3037
3038 assert(unit);
3039 assert(command);
3040 assert(context);
3041 assert(params);
3042 assert(exit_status);
3043
3044 rename_process_from_path(command->path);
3045
3046 /* We reset exactly these signals, since they are the
3047 * only ones we set to SIG_IGN in the main daemon. All
3048 * others we leave untouched because we set them to
3049 * SIG_DFL or a valid handler initially, both of which
3050 * will be demoted to SIG_DFL. */
3051 (void) default_signals(SIGNALS_CRASH_HANDLER,
3052 SIGNALS_IGNORE, -1);
3053
3054 if (context->ignore_sigpipe)
3055 (void) ignore_signals(SIGPIPE, -1);
3056
3057 r = reset_signal_mask();
3058 if (r < 0) {
3059 *exit_status = EXIT_SIGNAL_MASK;
3060 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
3061 }
3062
3063 if (params->idle_pipe)
3064 do_idle_pipe_dance(params->idle_pipe);
3065
3066 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3067 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3068 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3069 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
3070
3071 log_forget_fds();
3072 log_set_open_when_needed(true);
3073
3074 /* In case anything used libc syslog(), close this here, too */
3075 closelog();
3076
3077 n_fds = n_socket_fds + n_storage_fds;
3078 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
3079 if (r < 0) {
3080 *exit_status = EXIT_FDS;
3081 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3082 }
3083
3084 if (!context->same_pgrp)
3085 if (setsid() < 0) {
3086 *exit_status = EXIT_SETSID;
3087 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3088 }
3089
3090 exec_context_tty_reset(context, params);
3091
3092 if (unit_shall_confirm_spawn(unit)) {
3093 const char *vc = params->confirm_spawn;
3094 _cleanup_free_ char *cmdline = NULL;
3095
3096 cmdline = exec_command_line(command->argv);
3097 if (!cmdline) {
3098 *exit_status = EXIT_MEMORY;
3099 return log_oom();
3100 }
3101
3102 r = ask_for_confirmation(vc, unit, cmdline);
3103 if (r != CONFIRM_EXECUTE) {
3104 if (r == CONFIRM_PRETEND_SUCCESS) {
3105 *exit_status = EXIT_SUCCESS;
3106 return 0;
3107 }
3108 *exit_status = EXIT_CONFIRM;
3109 log_unit_error(unit, "Execution cancelled by the user");
3110 return -ECANCELED;
3111 }
3112 }
3113
3114 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3115 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3116 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3117 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3118 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3119 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3120 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3121 *exit_status = EXIT_MEMORY;
3122 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3123 }
3124
3125 if (context->dynamic_user && dcreds) {
3126 _cleanup_strv_free_ char **suggested_paths = NULL;
3127
3128 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3129 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3130 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3131 *exit_status = EXIT_USER;
3132 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3133 }
3134
3135 r = compile_suggested_paths(context, params, &suggested_paths);
3136 if (r < 0) {
3137 *exit_status = EXIT_MEMORY;
3138 return log_oom();
3139 }
3140
3141 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3142 if (r < 0) {
3143 *exit_status = EXIT_USER;
3144 if (r == -EILSEQ) {
3145 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3146 return -EOPNOTSUPP;
3147 }
3148 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3149 }
3150
3151 if (!uid_is_valid(uid)) {
3152 *exit_status = EXIT_USER;
3153 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
3154 return -ESRCH;
3155 }
3156
3157 if (!gid_is_valid(gid)) {
3158 *exit_status = EXIT_USER;
3159 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
3160 return -ESRCH;
3161 }
3162
3163 if (dcreds->user)
3164 username = dcreds->user->name;
3165
3166 } else {
3167 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3168 if (r < 0) {
3169 *exit_status = EXIT_USER;
3170 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3171 }
3172
3173 r = get_fixed_group(context, &groupname, &gid);
3174 if (r < 0) {
3175 *exit_status = EXIT_GROUP;
3176 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3177 }
3178 }
3179
3180 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3181 r = get_supplementary_groups(context, username, groupname, gid,
3182 &supplementary_gids, &ngids);
3183 if (r < 0) {
3184 *exit_status = EXIT_GROUP;
3185 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3186 }
3187
3188 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3189 if (r < 0) {
3190 *exit_status = EXIT_USER;
3191 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3192 }
3193
3194 user_lookup_fd = safe_close(user_lookup_fd);
3195
3196 r = acquire_home(context, uid, &home, &home_buffer);
3197 if (r < 0) {
3198 *exit_status = EXIT_CHDIR;
3199 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3200 }
3201
3202 /* If a socket is connected to STDIN/STDOUT/STDERR, we
3203 * must be sure to drop O_NONBLOCK */
3204 if (socket_fd >= 0)
3205 (void) fd_nonblock(socket_fd, false);
3206
3207 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3208 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3209 if (params->cgroup_path) {
3210 _cleanup_free_ char *p = NULL;
3211
3212 r = exec_parameters_get_cgroup_path(params, &p);
3213 if (r < 0) {
3214 *exit_status = EXIT_CGROUP;
3215 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3216 }
3217
3218 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3219 if (r < 0) {
3220 *exit_status = EXIT_CGROUP;
3221 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3222 }
3223 }
3224
3225 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3226 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3227 if (r < 0) {
3228 *exit_status = EXIT_NETWORK;
3229 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3230 }
3231 }
3232
3233 r = setup_input(context, params, socket_fd, named_iofds);
3234 if (r < 0) {
3235 *exit_status = EXIT_STDIN;
3236 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3237 }
3238
3239 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3240 if (r < 0) {
3241 *exit_status = EXIT_STDOUT;
3242 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3243 }
3244
3245 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3246 if (r < 0) {
3247 *exit_status = EXIT_STDERR;
3248 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3249 }
3250
3251 if (context->oom_score_adjust_set) {
3252 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3253 * prohibit write access to this file, and we shouldn't trip up over that. */
3254 r = set_oom_score_adjust(context->oom_score_adjust);
3255 if (IN_SET(r, -EPERM, -EACCES))
3256 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3257 else if (r < 0) {
3258 *exit_status = EXIT_OOM_ADJUST;
3259 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3260 }
3261 }
3262
3263 if (context->nice_set) {
3264 r = setpriority_closest(context->nice);
3265 if (r < 0)
3266 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
3267 }
3268
3269 if (context->cpu_sched_set) {
3270 struct sched_param param = {
3271 .sched_priority = context->cpu_sched_priority,
3272 };
3273
3274 r = sched_setscheduler(0,
3275 context->cpu_sched_policy |
3276 (context->cpu_sched_reset_on_fork ?
3277 SCHED_RESET_ON_FORK : 0),
3278 &param);
3279 if (r < 0) {
3280 *exit_status = EXIT_SETSCHEDULER;
3281 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3282 }
3283 }
3284
3285 if (context->cpu_set.set)
3286 if (sched_setaffinity(0, context->cpu_set.allocated, context->cpu_set.set) < 0) {
3287 *exit_status = EXIT_CPUAFFINITY;
3288 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3289 }
3290
3291 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3292 r = apply_numa_policy(&context->numa_policy);
3293 if (r == -EOPNOTSUPP)
3294 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
3295 else if (r < 0) {
3296 *exit_status = EXIT_NUMA_POLICY;
3297 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
3298 }
3299 }
3300
3301 if (context->ioprio_set)
3302 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3303 *exit_status = EXIT_IOPRIO;
3304 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3305 }
3306
3307 if (context->timer_slack_nsec != NSEC_INFINITY)
3308 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3309 *exit_status = EXIT_TIMERSLACK;
3310 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3311 }
3312
3313 if (context->personality != PERSONALITY_INVALID) {
3314 r = safe_personality(context->personality);
3315 if (r < 0) {
3316 *exit_status = EXIT_PERSONALITY;
3317 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3318 }
3319 }
3320
3321 if (context->utmp_id)
3322 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3323 context->tty_path,
3324 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
3325 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3326 USER_PROCESS,
3327 username);
3328
3329 if (uid_is_valid(uid)) {
3330 r = chown_terminal(STDIN_FILENO, uid);
3331 if (r < 0) {
3332 *exit_status = EXIT_STDIN;
3333 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3334 }
3335 }
3336
3337 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3338 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3339 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3340 * touch a single hierarchy too. */
3341 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3342 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3343 if (r < 0) {
3344 *exit_status = EXIT_CGROUP;
3345 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3346 }
3347 }
3348
3349 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3350 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3351 if (r < 0)
3352 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3353 }
3354
3355 r = build_environment(
3356 unit,
3357 context,
3358 params,
3359 n_fds,
3360 home,
3361 username,
3362 shell,
3363 journal_stream_dev,
3364 journal_stream_ino,
3365 &our_env);
3366 if (r < 0) {
3367 *exit_status = EXIT_MEMORY;
3368 return log_oom();
3369 }
3370
3371 r = build_pass_environment(context, &pass_env);
3372 if (r < 0) {
3373 *exit_status = EXIT_MEMORY;
3374 return log_oom();
3375 }
3376
3377 accum_env = strv_env_merge(5,
3378 params->environment,
3379 our_env,
3380 pass_env,
3381 context->environment,
3382 files_env,
3383 NULL);
3384 if (!accum_env) {
3385 *exit_status = EXIT_MEMORY;
3386 return log_oom();
3387 }
3388 accum_env = strv_env_clean(accum_env);
3389
3390 (void) umask(context->umask);
3391
3392 r = setup_keyring(unit, context, params, uid, gid);
3393 if (r < 0) {
3394 *exit_status = EXIT_KEYRING;
3395 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3396 }
3397
3398 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3399 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3400
3401 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3402 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3403
3404 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3405 if (needs_ambient_hack)
3406 needs_setuid = false;
3407 else
3408 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3409
3410 if (needs_sandboxing) {
3411 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3412 * present. The actual MAC context application will happen later, as late as possible, to avoid
3413 * impacting our own code paths. */
3414
3415 #if HAVE_SELINUX
3416 use_selinux = mac_selinux_use();
3417 #endif
3418 #if ENABLE_SMACK
3419 use_smack = mac_smack_use();
3420 #endif
3421 #if HAVE_APPARMOR
3422 use_apparmor = mac_apparmor_use();
3423 #endif
3424 }
3425
3426 if (needs_sandboxing) {
3427 int which_failed;
3428
3429 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3430 * is set here. (See below.) */
3431
3432 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3433 if (r < 0) {
3434 *exit_status = EXIT_LIMITS;
3435 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3436 }
3437 }
3438
3439 if (needs_setuid) {
3440
3441 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3442 * wins here. (See above.) */
3443
3444 if (context->pam_name && username) {
3445 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3446 if (r < 0) {
3447 *exit_status = EXIT_PAM;
3448 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3449 }
3450
3451 ngids_after_pam = getgroups_alloc(&gids_after_pam);
3452 if (ngids_after_pam < 0) {
3453 *exit_status = EXIT_MEMORY;
3454 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
3455 }
3456 }
3457 }
3458
3459 if (needs_sandboxing) {
3460 #if HAVE_SELINUX
3461 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3462 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3463 if (r < 0) {
3464 *exit_status = EXIT_SELINUX_CONTEXT;
3465 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3466 }
3467 }
3468 #endif
3469
3470 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
3471 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
3472 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
3473 if (context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
3474 userns_set_up = true;
3475 r = setup_private_users(saved_uid, saved_gid, uid, gid);
3476 if (r < 0) {
3477 *exit_status = EXIT_USER;
3478 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
3479 }
3480 }
3481 }
3482
3483 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3484
3485 if (ns_type_supported(NAMESPACE_NET)) {
3486 r = setup_netns(runtime->netns_storage_socket);
3487 if (r < 0) {
3488 *exit_status = EXIT_NETWORK;
3489 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3490 }
3491 } else if (context->network_namespace_path) {
3492 *exit_status = EXIT_NETWORK;
3493 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP), "NetworkNamespacePath= is not supported, refusing.");
3494 } else
3495 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3496 }
3497
3498 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3499 if (needs_mount_namespace) {
3500 _cleanup_free_ char *error_path = NULL;
3501
3502 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3503 if (r < 0) {
3504 *exit_status = EXIT_NAMESPACE;
3505 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3506 error_path ? ": " : "", strempty(error_path));
3507 }
3508 }
3509
3510 if (context->protect_hostname) {
3511 if (ns_type_supported(NAMESPACE_UTS)) {
3512 if (unshare(CLONE_NEWUTS) < 0) {
3513 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
3514 *exit_status = EXIT_NAMESPACE;
3515 return log_unit_error_errno(unit, errno, "Failed to set up UTS namespacing: %m");
3516 }
3517
3518 log_unit_warning(unit, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
3519 }
3520 } else
3521 log_unit_warning(unit, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
3522 #if HAVE_SECCOMP
3523 r = seccomp_protect_hostname();
3524 if (r < 0) {
3525 *exit_status = EXIT_SECCOMP;
3526 return log_unit_error_errno(unit, r, "Failed to apply hostname restrictions: %m");
3527 }
3528 #endif
3529 }
3530
3531 /* Drop groups as early as possible.
3532 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
3533 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
3534 if (needs_setuid) {
3535 _cleanup_free_ gid_t *gids_to_enforce = NULL;
3536 int ngids_to_enforce = 0;
3537
3538 ngids_to_enforce = merge_gid_lists(supplementary_gids,
3539 ngids,
3540 gids_after_pam,
3541 ngids_after_pam,
3542 &gids_to_enforce);
3543 if (ngids_to_enforce < 0) {
3544 *exit_status = EXIT_MEMORY;
3545 return log_unit_error_errno(unit,
3546 ngids_to_enforce,
3547 "Failed to merge group lists. Group membership might be incorrect: %m");
3548 }
3549
3550 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
3551 if (r < 0) {
3552 *exit_status = EXIT_GROUP;
3553 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3554 }
3555 }
3556
3557 /* If the user namespace was not set up above, try to do it now.
3558 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
3559 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
3560 * case of mount namespaces being less privileged when the mount point list is copied from a
3561 * different user namespace). */
3562
3563 if (needs_sandboxing && context->private_users && !userns_set_up) {
3564 r = setup_private_users(saved_uid, saved_gid, uid, gid);
3565 if (r < 0) {
3566 *exit_status = EXIT_USER;
3567 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3568 }
3569 }
3570
3571 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3572 * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3573 * however if we have it as we want to keep it open until the final execve(). */
3574
3575 if (params->exec_fd >= 0) {
3576 exec_fd = params->exec_fd;
3577
3578 if (exec_fd < 3 + (int) n_fds) {
3579 int moved_fd;
3580
3581 /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3582 * process we are about to execute. */
3583
3584 moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3585 if (moved_fd < 0) {
3586 *exit_status = EXIT_FDS;
3587 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3588 }
3589
3590 safe_close(exec_fd);
3591 exec_fd = moved_fd;
3592 } else {
3593 /* This fd should be FD_CLOEXEC already, but let's make sure. */
3594 r = fd_cloexec(exec_fd, true);
3595 if (r < 0) {
3596 *exit_status = EXIT_FDS;
3597 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3598 }
3599 }
3600
3601 fds_with_exec_fd = newa(int, n_fds + 1);
3602 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3603 fds_with_exec_fd[n_fds] = exec_fd;
3604 n_fds_with_exec_fd = n_fds + 1;
3605 } else {
3606 fds_with_exec_fd = fds;
3607 n_fds_with_exec_fd = n_fds;
3608 }
3609
3610 r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3611 if (r >= 0)
3612 r = shift_fds(fds, n_fds);
3613 if (r >= 0)
3614 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3615 if (r < 0) {
3616 *exit_status = EXIT_FDS;
3617 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3618 }
3619
3620 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3621 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3622 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3623 * came this far. */
3624
3625 secure_bits = context->secure_bits;
3626
3627 if (needs_sandboxing) {
3628 uint64_t bset;
3629
3630 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3631 * requested. (Note this is placed after the general resource limit initialization, see
3632 * above, in order to take precedence.) */
3633 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3634 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3635 *exit_status = EXIT_LIMITS;
3636 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3637 }
3638 }
3639
3640 #if ENABLE_SMACK
3641 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3642 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3643 if (use_smack) {
3644 r = setup_smack(context, command);
3645 if (r < 0) {
3646 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3647 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3648 }
3649 }
3650 #endif
3651
3652 bset = context->capability_bounding_set;
3653 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3654 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3655 * instead of us doing that */
3656 if (needs_ambient_hack)
3657 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3658 (UINT64_C(1) << CAP_SETUID) |
3659 (UINT64_C(1) << CAP_SETGID);
3660
3661 if (!cap_test_all(bset)) {
3662 r = capability_bounding_set_drop(bset, false);
3663 if (r < 0) {
3664 *exit_status = EXIT_CAPABILITIES;
3665 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3666 }
3667 }
3668
3669 /* This is done before enforce_user, but ambient set
3670 * does not survive over setresuid() if keep_caps is not set. */
3671 if (!needs_ambient_hack) {
3672 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3673 if (r < 0) {
3674 *exit_status = EXIT_CAPABILITIES;
3675 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3676 }
3677 }
3678 }
3679
3680 /* chroot to root directory first, before we lose the ability to chroot */
3681 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
3682 if (r < 0)
3683 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
3684
3685 if (needs_setuid) {
3686 if (uid_is_valid(uid)) {
3687 r = enforce_user(context, uid);
3688 if (r < 0) {
3689 *exit_status = EXIT_USER;
3690 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3691 }
3692
3693 if (!needs_ambient_hack &&
3694 context->capability_ambient_set != 0) {
3695
3696 /* Fix the ambient capabilities after user change. */
3697 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3698 if (r < 0) {
3699 *exit_status = EXIT_CAPABILITIES;
3700 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3701 }
3702
3703 /* If we were asked to change user and ambient capabilities
3704 * were requested, we had to add keep-caps to the securebits
3705 * so that we would maintain the inherited capability set
3706 * through the setresuid(). Make sure that the bit is added
3707 * also to the context secure_bits so that we don't try to
3708 * drop the bit away next. */
3709
3710 secure_bits |= 1<<SECURE_KEEP_CAPS;
3711 }
3712 }
3713 }
3714
3715 /* Apply working directory here, because the working directory might be on NFS and only the user running
3716 * this service might have the correct privilege to change to the working directory */
3717 r = apply_working_directory(context, params, home, exit_status);
3718 if (r < 0)
3719 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3720
3721 if (needs_sandboxing) {
3722 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3723 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3724 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3725 * are restricted. */
3726
3727 #if HAVE_SELINUX
3728 if (use_selinux) {
3729 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3730
3731 if (exec_context) {
3732 r = setexeccon(exec_context);
3733 if (r < 0) {
3734 *exit_status = EXIT_SELINUX_CONTEXT;
3735 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3736 }
3737 }
3738 }
3739 #endif
3740
3741 #if HAVE_APPARMOR
3742 if (use_apparmor && context->apparmor_profile) {
3743 r = aa_change_onexec(context->apparmor_profile);
3744 if (r < 0 && !context->apparmor_profile_ignore) {
3745 *exit_status = EXIT_APPARMOR_PROFILE;
3746 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3747 }
3748 }
3749 #endif
3750
3751 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3752 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3753 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3754 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3755 *exit_status = EXIT_SECUREBITS;
3756 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3757 }
3758
3759 if (context_has_no_new_privileges(context))
3760 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3761 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3762 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3763 }
3764
3765 #if HAVE_SECCOMP
3766 r = apply_address_families(unit, context);
3767 if (r < 0) {
3768 *exit_status = EXIT_ADDRESS_FAMILIES;
3769 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3770 }
3771
3772 r = apply_memory_deny_write_execute(unit, context);
3773 if (r < 0) {
3774 *exit_status = EXIT_SECCOMP;
3775 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3776 }
3777
3778 r = apply_restrict_realtime(unit, context);
3779 if (r < 0) {
3780 *exit_status = EXIT_SECCOMP;
3781 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3782 }
3783
3784 r = apply_restrict_suid_sgid(unit, context);
3785 if (r < 0) {
3786 *exit_status = EXIT_SECCOMP;
3787 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3788 }
3789
3790 r = apply_restrict_namespaces(unit, context);
3791 if (r < 0) {
3792 *exit_status = EXIT_SECCOMP;
3793 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3794 }
3795
3796 r = apply_protect_sysctl(unit, context);
3797 if (r < 0) {
3798 *exit_status = EXIT_SECCOMP;
3799 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3800 }
3801
3802 r = apply_protect_kernel_modules(unit, context);
3803 if (r < 0) {
3804 *exit_status = EXIT_SECCOMP;
3805 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3806 }
3807
3808 r = apply_protect_kernel_logs(unit, context);
3809 if (r < 0) {
3810 *exit_status = EXIT_SECCOMP;
3811 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
3812 }
3813
3814 r = apply_protect_clock(unit, context);
3815 if (r < 0) {
3816 *exit_status = EXIT_SECCOMP;
3817 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
3818 }
3819
3820 r = apply_private_devices(unit, context);
3821 if (r < 0) {
3822 *exit_status = EXIT_SECCOMP;
3823 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3824 }
3825
3826 r = apply_syscall_archs(unit, context);
3827 if (r < 0) {
3828 *exit_status = EXIT_SECCOMP;
3829 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3830 }
3831
3832 r = apply_lock_personality(unit, context);
3833 if (r < 0) {
3834 *exit_status = EXIT_SECCOMP;
3835 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3836 }
3837
3838 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3839 * by the filter as little as possible. */
3840 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3841 if (r < 0) {
3842 *exit_status = EXIT_SECCOMP;
3843 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3844 }
3845 #endif
3846 }
3847
3848 if (!strv_isempty(context->unset_environment)) {
3849 char **ee = NULL;
3850
3851 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3852 if (!ee) {
3853 *exit_status = EXIT_MEMORY;
3854 return log_oom();
3855 }
3856
3857 strv_free_and_replace(accum_env, ee);
3858 }
3859
3860 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3861 replaced_argv = replace_env_argv(command->argv, accum_env);
3862 if (!replaced_argv) {
3863 *exit_status = EXIT_MEMORY;
3864 return log_oom();
3865 }
3866 final_argv = replaced_argv;
3867 } else
3868 final_argv = command->argv;
3869
3870 if (DEBUG_LOGGING) {
3871 _cleanup_free_ char *line;
3872
3873 line = exec_command_line(final_argv);
3874 if (line)
3875 log_struct(LOG_DEBUG,
3876 "EXECUTABLE=%s", command->path,
3877 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3878 LOG_UNIT_ID(unit),
3879 LOG_UNIT_INVOCATION_ID(unit));
3880 }
3881
3882 if (exec_fd >= 0) {
3883 uint8_t hot = 1;
3884
3885 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3886 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3887
3888 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3889 *exit_status = EXIT_EXEC;
3890 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3891 }
3892 }
3893
3894 execve(command->path, final_argv, accum_env);
3895 r = -errno;
3896
3897 if (exec_fd >= 0) {
3898 uint8_t hot = 0;
3899
3900 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3901 * that POLLHUP on it no longer means execve() succeeded. */
3902
3903 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3904 *exit_status = EXIT_EXEC;
3905 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
3906 }
3907 }
3908
3909 if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3910 log_struct_errno(LOG_INFO, r,
3911 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3912 LOG_UNIT_ID(unit),
3913 LOG_UNIT_INVOCATION_ID(unit),
3914 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3915 command->path),
3916 "EXECUTABLE=%s", command->path);
3917 return 0;
3918 }
3919
3920 *exit_status = EXIT_EXEC;
3921 return log_unit_error_errno(unit, r, "Failed to execute command: %m");
3922 }
3923
3924 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
3925 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
3926
3927 int exec_spawn(Unit *unit,
3928 ExecCommand *command,
3929 const ExecContext *context,
3930 const ExecParameters *params,
3931 ExecRuntime *runtime,
3932 DynamicCreds *dcreds,
3933 pid_t *ret) {
3934
3935 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
3936 _cleanup_free_ char *subcgroup_path = NULL;
3937 _cleanup_strv_free_ char **files_env = NULL;
3938 size_t n_storage_fds = 0, n_socket_fds = 0;
3939 _cleanup_free_ char *line = NULL;
3940 pid_t pid;
3941
3942 assert(unit);
3943 assert(command);
3944 assert(context);
3945 assert(ret);
3946 assert(params);
3947 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
3948
3949 if (context->std_input == EXEC_INPUT_SOCKET ||
3950 context->std_output == EXEC_OUTPUT_SOCKET ||
3951 context->std_error == EXEC_OUTPUT_SOCKET) {
3952
3953 if (params->n_socket_fds > 1) {
3954 log_unit_error(unit, "Got more than one socket.");
3955 return -EINVAL;
3956 }
3957
3958 if (params->n_socket_fds == 0) {
3959 log_unit_error(unit, "Got no socket.");
3960 return -EINVAL;
3961 }
3962
3963 socket_fd = params->fds[0];
3964 } else {
3965 socket_fd = -1;
3966 fds = params->fds;
3967 n_socket_fds = params->n_socket_fds;
3968 n_storage_fds = params->n_storage_fds;
3969 }
3970
3971 r = exec_context_named_iofds(context, params, named_iofds);
3972 if (r < 0)
3973 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3974
3975 r = exec_context_load_environment(unit, context, &files_env);
3976 if (r < 0)
3977 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3978
3979 line = exec_command_line(command->argv);
3980 if (!line)
3981 return log_oom();
3982
3983 log_struct(LOG_DEBUG,
3984 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3985 "EXECUTABLE=%s", command->path,
3986 LOG_UNIT_ID(unit),
3987 LOG_UNIT_INVOCATION_ID(unit));
3988
3989 if (params->cgroup_path) {
3990 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
3991 if (r < 0)
3992 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
3993 if (r > 0) { /* We are using a child cgroup */
3994 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
3995 if (r < 0)
3996 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
3997 }
3998 }
3999
4000 pid = fork();
4001 if (pid < 0)
4002 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
4003
4004 if (pid == 0) {
4005 int exit_status = EXIT_SUCCESS;
4006
4007 r = exec_child(unit,
4008 command,
4009 context,
4010 params,
4011 runtime,
4012 dcreds,
4013 socket_fd,
4014 named_iofds,
4015 fds,
4016 n_socket_fds,
4017 n_storage_fds,
4018 files_env,
4019 unit->manager->user_lookup_fds[1],
4020 &exit_status);
4021
4022 if (r < 0) {
4023 const char *status =
4024 exit_status_to_string(exit_status,
4025 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
4026
4027 log_struct_errno(LOG_ERR, r,
4028 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4029 LOG_UNIT_ID(unit),
4030 LOG_UNIT_INVOCATION_ID(unit),
4031 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
4032 status, command->path),
4033 "EXECUTABLE=%s", command->path);
4034 }
4035
4036 _exit(exit_status);
4037 }
4038
4039 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
4040
4041 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
4042 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
4043 * process will be killed too). */
4044 if (subcgroup_path)
4045 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
4046
4047 exec_status_start(&command->exec_status, pid);
4048
4049 *ret = pid;
4050 return 0;
4051 }
4052
4053 void exec_context_init(ExecContext *c) {
4054 ExecDirectoryType i;
4055
4056 assert(c);
4057
4058 c->umask = 0022;
4059 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
4060 c->cpu_sched_policy = SCHED_OTHER;
4061 c->syslog_priority = LOG_DAEMON|LOG_INFO;
4062 c->syslog_level_prefix = true;
4063 c->ignore_sigpipe = true;
4064 c->timer_slack_nsec = NSEC_INFINITY;
4065 c->personality = PERSONALITY_INVALID;
4066 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4067 c->directories[i].mode = 0755;
4068 c->timeout_clean_usec = USEC_INFINITY;
4069 c->capability_bounding_set = CAP_ALL;
4070 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4071 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
4072 c->log_level_max = -1;
4073 numa_policy_reset(&c->numa_policy);
4074 }
4075
4076 void exec_context_done(ExecContext *c) {
4077 ExecDirectoryType i;
4078 size_t l;
4079
4080 assert(c);
4081
4082 c->environment = strv_free(c->environment);
4083 c->environment_files = strv_free(c->environment_files);
4084 c->pass_environment = strv_free(c->pass_environment);
4085 c->unset_environment = strv_free(c->unset_environment);
4086
4087 rlimit_free_all(c->rlimit);
4088
4089 for (l = 0; l < 3; l++) {
4090 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
4091 c->stdio_file[l] = mfree(c->stdio_file[l]);
4092 }
4093
4094 c->working_directory = mfree(c->working_directory);
4095 c->root_directory = mfree(c->root_directory);
4096 c->root_image = mfree(c->root_image);
4097 c->tty_path = mfree(c->tty_path);
4098 c->syslog_identifier = mfree(c->syslog_identifier);
4099 c->user = mfree(c->user);
4100 c->group = mfree(c->group);
4101
4102 c->supplementary_groups = strv_free(c->supplementary_groups);
4103
4104 c->pam_name = mfree(c->pam_name);
4105
4106 c->read_only_paths = strv_free(c->read_only_paths);
4107 c->read_write_paths = strv_free(c->read_write_paths);
4108 c->inaccessible_paths = strv_free(c->inaccessible_paths);
4109
4110 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
4111 c->bind_mounts = NULL;
4112 c->n_bind_mounts = 0;
4113 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4114 c->temporary_filesystems = NULL;
4115 c->n_temporary_filesystems = 0;
4116
4117 cpu_set_reset(&c->cpu_set);
4118 numa_policy_reset(&c->numa_policy);
4119
4120 c->utmp_id = mfree(c->utmp_id);
4121 c->selinux_context = mfree(c->selinux_context);
4122 c->apparmor_profile = mfree(c->apparmor_profile);
4123 c->smack_process_label = mfree(c->smack_process_label);
4124
4125 c->syscall_filter = hashmap_free(c->syscall_filter);
4126 c->syscall_archs = set_free(c->syscall_archs);
4127 c->address_families = set_free(c->address_families);
4128
4129 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4130 c->directories[i].paths = strv_free(c->directories[i].paths);
4131
4132 c->log_level_max = -1;
4133
4134 exec_context_free_log_extra_fields(c);
4135
4136 c->log_ratelimit_interval_usec = 0;
4137 c->log_ratelimit_burst = 0;
4138
4139 c->stdin_data = mfree(c->stdin_data);
4140 c->stdin_data_size = 0;
4141
4142 c->network_namespace_path = mfree(c->network_namespace_path);
4143 }
4144
4145 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4146 char **i;
4147
4148 assert(c);
4149
4150 if (!runtime_prefix)
4151 return 0;
4152
4153 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4154 _cleanup_free_ char *p;
4155
4156 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4157 p = path_join(runtime_prefix, "private", *i);
4158 else
4159 p = path_join(runtime_prefix, *i);
4160 if (!p)
4161 return -ENOMEM;
4162
4163 /* We execute this synchronously, since we need to be sure this is gone when we start the
4164 * service next. */
4165 (void) rm_rf(p, REMOVE_ROOT);
4166 }
4167
4168 return 0;
4169 }
4170
4171 static void exec_command_done(ExecCommand *c) {
4172 assert(c);
4173
4174 c->path = mfree(c->path);
4175 c->argv = strv_free(c->argv);
4176 }
4177
4178 void exec_command_done_array(ExecCommand *c, size_t n) {
4179 size_t i;
4180
4181 for (i = 0; i < n; i++)
4182 exec_command_done(c+i);
4183 }
4184
4185 ExecCommand* exec_command_free_list(ExecCommand *c) {
4186 ExecCommand *i;
4187
4188 while ((i = c)) {
4189 LIST_REMOVE(command, c, i);
4190 exec_command_done(i);
4191 free(i);
4192 }
4193
4194 return NULL;
4195 }
4196
4197 void exec_command_free_array(ExecCommand **c, size_t n) {
4198 size_t i;
4199
4200 for (i = 0; i < n; i++)
4201 c[i] = exec_command_free_list(c[i]);
4202 }
4203
4204 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4205 size_t i;
4206
4207 for (i = 0; i < n; i++)
4208 exec_status_reset(&c[i].exec_status);
4209 }
4210
4211 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4212 size_t i;
4213
4214 for (i = 0; i < n; i++) {
4215 ExecCommand *z;
4216
4217 LIST_FOREACH(command, z, c[i])
4218 exec_status_reset(&z->exec_status);
4219 }
4220 }
4221
4222 typedef struct InvalidEnvInfo {
4223 const Unit *unit;
4224 const char *path;
4225 } InvalidEnvInfo;
4226
4227 static void invalid_env(const char *p, void *userdata) {
4228 InvalidEnvInfo *info = userdata;
4229
4230 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4231 }
4232
4233 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4234 assert(c);
4235
4236 switch (fd_index) {
4237
4238 case STDIN_FILENO:
4239 if (c->std_input != EXEC_INPUT_NAMED_FD)
4240 return NULL;
4241
4242 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4243
4244 case STDOUT_FILENO:
4245 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4246 return NULL;
4247
4248 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4249
4250 case STDERR_FILENO:
4251 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4252 return NULL;
4253
4254 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4255
4256 default:
4257 return NULL;
4258 }
4259 }
4260
4261 static int exec_context_named_iofds(
4262 const ExecContext *c,
4263 const ExecParameters *p,
4264 int named_iofds[static 3]) {
4265
4266 size_t i, targets;
4267 const char* stdio_fdname[3];
4268 size_t n_fds;
4269
4270 assert(c);
4271 assert(p);
4272 assert(named_iofds);
4273
4274 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4275 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4276 (c->std_error == EXEC_OUTPUT_NAMED_FD);
4277
4278 for (i = 0; i < 3; i++)
4279 stdio_fdname[i] = exec_context_fdname(c, i);
4280
4281 n_fds = p->n_storage_fds + p->n_socket_fds;
4282
4283 for (i = 0; i < n_fds && targets > 0; i++)
4284 if (named_iofds[STDIN_FILENO] < 0 &&
4285 c->std_input == EXEC_INPUT_NAMED_FD &&
4286 stdio_fdname[STDIN_FILENO] &&
4287 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4288
4289 named_iofds[STDIN_FILENO] = p->fds[i];
4290 targets--;
4291
4292 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4293 c->std_output == EXEC_OUTPUT_NAMED_FD &&
4294 stdio_fdname[STDOUT_FILENO] &&
4295 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4296
4297 named_iofds[STDOUT_FILENO] = p->fds[i];
4298 targets--;
4299
4300 } else if (named_iofds[STDERR_FILENO] < 0 &&
4301 c->std_error == EXEC_OUTPUT_NAMED_FD &&
4302 stdio_fdname[STDERR_FILENO] &&
4303 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4304
4305 named_iofds[STDERR_FILENO] = p->fds[i];
4306 targets--;
4307 }
4308
4309 return targets == 0 ? 0 : -ENOENT;
4310 }
4311
4312 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
4313 char **i, **r = NULL;
4314
4315 assert(c);
4316 assert(l);
4317
4318 STRV_FOREACH(i, c->environment_files) {
4319 char *fn;
4320 int k;
4321 unsigned n;
4322 bool ignore = false;
4323 char **p;
4324 _cleanup_globfree_ glob_t pglob = {};
4325
4326 fn = *i;
4327
4328 if (fn[0] == '-') {
4329 ignore = true;
4330 fn++;
4331 }
4332
4333 if (!path_is_absolute(fn)) {
4334 if (ignore)
4335 continue;
4336
4337 strv_free(r);
4338 return -EINVAL;
4339 }
4340
4341 /* Filename supports globbing, take all matching files */
4342 k = safe_glob(fn, 0, &pglob);
4343 if (k < 0) {
4344 if (ignore)
4345 continue;
4346
4347 strv_free(r);
4348 return k;
4349 }
4350
4351 /* When we don't match anything, -ENOENT should be returned */
4352 assert(pglob.gl_pathc > 0);
4353
4354 for (n = 0; n < pglob.gl_pathc; n++) {
4355 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4356 if (k < 0) {
4357 if (ignore)
4358 continue;
4359
4360 strv_free(r);
4361 return k;
4362 }
4363 /* Log invalid environment variables with filename */
4364 if (p) {
4365 InvalidEnvInfo info = {
4366 .unit = unit,
4367 .path = pglob.gl_pathv[n]
4368 };
4369
4370 p = strv_env_clean_with_callback(p, invalid_env, &info);
4371 }
4372
4373 if (!r)
4374 r = p;
4375 else {
4376 char **m;
4377
4378 m = strv_env_merge(2, r, p);
4379 strv_free(r);
4380 strv_free(p);
4381 if (!m)
4382 return -ENOMEM;
4383
4384 r = m;
4385 }
4386 }
4387 }
4388
4389 *l = r;
4390
4391 return 0;
4392 }
4393
4394 static bool tty_may_match_dev_console(const char *tty) {
4395 _cleanup_free_ char *resolved = NULL;
4396
4397 if (!tty)
4398 return true;
4399
4400 tty = skip_dev_prefix(tty);
4401
4402 /* trivial identity? */
4403 if (streq(tty, "console"))
4404 return true;
4405
4406 if (resolve_dev_console(&resolved) < 0)
4407 return true; /* if we could not resolve, assume it may */
4408
4409 /* "tty0" means the active VC, so it may be the same sometimes */
4410 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4411 }
4412
4413 static bool exec_context_may_touch_tty(const ExecContext *ec) {
4414 assert(ec);
4415
4416 return ec->tty_reset ||
4417 ec->tty_vhangup ||
4418 ec->tty_vt_disallocate ||
4419 is_terminal_input(ec->std_input) ||
4420 is_terminal_output(ec->std_output) ||
4421 is_terminal_output(ec->std_error);
4422 }
4423
4424 bool exec_context_may_touch_console(const ExecContext *ec) {
4425
4426 return exec_context_may_touch_tty(ec) &&
4427 tty_may_match_dev_console(exec_context_tty_path(ec));
4428 }
4429
4430 static void strv_fprintf(FILE *f, char **l) {
4431 char **g;
4432
4433 assert(f);
4434
4435 STRV_FOREACH(g, l)
4436 fprintf(f, " %s", *g);
4437 }
4438
4439 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4440 char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
4441 ExecDirectoryType dt;
4442 unsigned i;
4443 int r;
4444
4445 assert(c);
4446 assert(f);
4447
4448 prefix = strempty(prefix);
4449
4450 fprintf(f,
4451 "%sUMask: %04o\n"
4452 "%sWorkingDirectory: %s\n"
4453 "%sRootDirectory: %s\n"
4454 "%sNonBlocking: %s\n"
4455 "%sPrivateTmp: %s\n"
4456 "%sPrivateDevices: %s\n"
4457 "%sProtectKernelTunables: %s\n"
4458 "%sProtectKernelModules: %s\n"
4459 "%sProtectKernelLogs: %s\n"
4460 "%sProtectClock: %s\n"
4461 "%sProtectControlGroups: %s\n"
4462 "%sPrivateNetwork: %s\n"
4463 "%sPrivateUsers: %s\n"
4464 "%sProtectHome: %s\n"
4465 "%sProtectSystem: %s\n"
4466 "%sMountAPIVFS: %s\n"
4467 "%sIgnoreSIGPIPE: %s\n"
4468 "%sMemoryDenyWriteExecute: %s\n"
4469 "%sRestrictRealtime: %s\n"
4470 "%sRestrictSUIDSGID: %s\n"
4471 "%sKeyringMode: %s\n"
4472 "%sProtectHostname: %s\n",
4473 prefix, c->umask,
4474 prefix, c->working_directory ? c->working_directory : "/",
4475 prefix, c->root_directory ? c->root_directory : "/",
4476 prefix, yes_no(c->non_blocking),
4477 prefix, yes_no(c->private_tmp),
4478 prefix, yes_no(c->private_devices),
4479 prefix, yes_no(c->protect_kernel_tunables),
4480 prefix, yes_no(c->protect_kernel_modules),
4481 prefix, yes_no(c->protect_kernel_logs),
4482 prefix, yes_no(c->protect_clock),
4483 prefix, yes_no(c->protect_control_groups),
4484 prefix, yes_no(c->private_network),
4485 prefix, yes_no(c->private_users),
4486 prefix, protect_home_to_string(c->protect_home),
4487 prefix, protect_system_to_string(c->protect_system),
4488 prefix, yes_no(c->mount_apivfs),
4489 prefix, yes_no(c->ignore_sigpipe),
4490 prefix, yes_no(c->memory_deny_write_execute),
4491 prefix, yes_no(c->restrict_realtime),
4492 prefix, yes_no(c->restrict_suid_sgid),
4493 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4494 prefix, yes_no(c->protect_hostname));
4495
4496 if (c->root_image)
4497 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4498
4499 STRV_FOREACH(e, c->environment)
4500 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4501
4502 STRV_FOREACH(e, c->environment_files)
4503 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4504
4505 STRV_FOREACH(e, c->pass_environment)
4506 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4507
4508 STRV_FOREACH(e, c->unset_environment)
4509 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4510
4511 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4512
4513 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4514 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4515
4516 STRV_FOREACH(d, c->directories[dt].paths)
4517 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4518 }
4519
4520 fprintf(f,
4521 "%sTimeoutCleanSec: %s\n",
4522 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
4523
4524 if (c->nice_set)
4525 fprintf(f,
4526 "%sNice: %i\n",
4527 prefix, c->nice);
4528
4529 if (c->oom_score_adjust_set)
4530 fprintf(f,
4531 "%sOOMScoreAdjust: %i\n",
4532 prefix, c->oom_score_adjust);
4533
4534 for (i = 0; i < RLIM_NLIMITS; i++)
4535 if (c->rlimit[i]) {
4536 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4537 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4538 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4539 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4540 }
4541
4542 if (c->ioprio_set) {
4543 _cleanup_free_ char *class_str = NULL;
4544
4545 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4546 if (r >= 0)
4547 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4548
4549 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4550 }
4551
4552 if (c->cpu_sched_set) {
4553 _cleanup_free_ char *policy_str = NULL;
4554
4555 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4556 if (r >= 0)
4557 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4558
4559 fprintf(f,
4560 "%sCPUSchedulingPriority: %i\n"
4561 "%sCPUSchedulingResetOnFork: %s\n",
4562 prefix, c->cpu_sched_priority,
4563 prefix, yes_no(c->cpu_sched_reset_on_fork));
4564 }
4565
4566 if (c->cpu_set.set) {
4567 _cleanup_free_ char *affinity = NULL;
4568
4569 affinity = cpu_set_to_range_string(&c->cpu_set);
4570 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
4571 }
4572
4573 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
4574 _cleanup_free_ char *nodes = NULL;
4575
4576 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
4577 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
4578 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
4579 }
4580
4581 if (c->timer_slack_nsec != NSEC_INFINITY)
4582 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4583
4584 fprintf(f,
4585 "%sStandardInput: %s\n"
4586 "%sStandardOutput: %s\n"
4587 "%sStandardError: %s\n",
4588 prefix, exec_input_to_string(c->std_input),
4589 prefix, exec_output_to_string(c->std_output),
4590 prefix, exec_output_to_string(c->std_error));
4591
4592 if (c->std_input == EXEC_INPUT_NAMED_FD)
4593 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4594 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4595 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4596 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4597 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4598
4599 if (c->std_input == EXEC_INPUT_FILE)
4600 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4601 if (c->std_output == EXEC_OUTPUT_FILE)
4602 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4603 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4604 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4605 if (c->std_error == EXEC_OUTPUT_FILE)
4606 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4607 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4608 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4609
4610 if (c->tty_path)
4611 fprintf(f,
4612 "%sTTYPath: %s\n"
4613 "%sTTYReset: %s\n"
4614 "%sTTYVHangup: %s\n"
4615 "%sTTYVTDisallocate: %s\n",
4616 prefix, c->tty_path,
4617 prefix, yes_no(c->tty_reset),
4618 prefix, yes_no(c->tty_vhangup),
4619 prefix, yes_no(c->tty_vt_disallocate));
4620
4621 if (IN_SET(c->std_output,
4622 EXEC_OUTPUT_SYSLOG,
4623 EXEC_OUTPUT_KMSG,
4624 EXEC_OUTPUT_JOURNAL,
4625 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4626 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4627 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4628 IN_SET(c->std_error,
4629 EXEC_OUTPUT_SYSLOG,
4630 EXEC_OUTPUT_KMSG,
4631 EXEC_OUTPUT_JOURNAL,
4632 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4633 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4634 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4635
4636 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4637
4638 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4639 if (r >= 0)
4640 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4641
4642 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4643 if (r >= 0)
4644 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4645 }
4646
4647 if (c->log_level_max >= 0) {
4648 _cleanup_free_ char *t = NULL;
4649
4650 (void) log_level_to_string_alloc(c->log_level_max, &t);
4651
4652 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4653 }
4654
4655 if (c->log_ratelimit_interval_usec > 0) {
4656 char buf_timespan[FORMAT_TIMESPAN_MAX];
4657
4658 fprintf(f,
4659 "%sLogRateLimitIntervalSec: %s\n",
4660 prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
4661 }
4662
4663 if (c->log_ratelimit_burst > 0)
4664 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
4665
4666 if (c->n_log_extra_fields > 0) {
4667 size_t j;
4668
4669 for (j = 0; j < c->n_log_extra_fields; j++) {
4670 fprintf(f, "%sLogExtraFields: ", prefix);
4671 fwrite(c->log_extra_fields[j].iov_base,
4672 1, c->log_extra_fields[j].iov_len,
4673 f);
4674 fputc('\n', f);
4675 }
4676 }
4677
4678 if (c->secure_bits) {
4679 _cleanup_free_ char *str = NULL;
4680
4681 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4682 if (r >= 0)
4683 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4684 }
4685
4686 if (c->capability_bounding_set != CAP_ALL) {
4687 _cleanup_free_ char *str = NULL;
4688
4689 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4690 if (r >= 0)
4691 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4692 }
4693
4694 if (c->capability_ambient_set != 0) {
4695 _cleanup_free_ char *str = NULL;
4696
4697 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4698 if (r >= 0)
4699 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4700 }
4701
4702 if (c->user)
4703 fprintf(f, "%sUser: %s\n", prefix, c->user);
4704 if (c->group)
4705 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4706
4707 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4708
4709 if (!strv_isempty(c->supplementary_groups)) {
4710 fprintf(f, "%sSupplementaryGroups:", prefix);
4711 strv_fprintf(f, c->supplementary_groups);
4712 fputs("\n", f);
4713 }
4714
4715 if (c->pam_name)
4716 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4717
4718 if (!strv_isempty(c->read_write_paths)) {
4719 fprintf(f, "%sReadWritePaths:", prefix);
4720 strv_fprintf(f, c->read_write_paths);
4721 fputs("\n", f);
4722 }
4723
4724 if (!strv_isempty(c->read_only_paths)) {
4725 fprintf(f, "%sReadOnlyPaths:", prefix);
4726 strv_fprintf(f, c->read_only_paths);
4727 fputs("\n", f);
4728 }
4729
4730 if (!strv_isempty(c->inaccessible_paths)) {
4731 fprintf(f, "%sInaccessiblePaths:", prefix);
4732 strv_fprintf(f, c->inaccessible_paths);
4733 fputs("\n", f);
4734 }
4735
4736 if (c->n_bind_mounts > 0)
4737 for (i = 0; i < c->n_bind_mounts; i++)
4738 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4739 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4740 c->bind_mounts[i].ignore_enoent ? "-": "",
4741 c->bind_mounts[i].source,
4742 c->bind_mounts[i].destination,
4743 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4744
4745 if (c->n_temporary_filesystems > 0)
4746 for (i = 0; i < c->n_temporary_filesystems; i++) {
4747 TemporaryFileSystem *t = c->temporary_filesystems + i;
4748
4749 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4750 t->path,
4751 isempty(t->options) ? "" : ":",
4752 strempty(t->options));
4753 }
4754
4755 if (c->utmp_id)
4756 fprintf(f,
4757 "%sUtmpIdentifier: %s\n",
4758 prefix, c->utmp_id);
4759
4760 if (c->selinux_context)
4761 fprintf(f,
4762 "%sSELinuxContext: %s%s\n",
4763 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4764
4765 if (c->apparmor_profile)
4766 fprintf(f,
4767 "%sAppArmorProfile: %s%s\n",
4768 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4769
4770 if (c->smack_process_label)
4771 fprintf(f,
4772 "%sSmackProcessLabel: %s%s\n",
4773 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4774
4775 if (c->personality != PERSONALITY_INVALID)
4776 fprintf(f,
4777 "%sPersonality: %s\n",
4778 prefix, strna(personality_to_string(c->personality)));
4779
4780 fprintf(f,
4781 "%sLockPersonality: %s\n",
4782 prefix, yes_no(c->lock_personality));
4783
4784 if (c->syscall_filter) {
4785 #if HAVE_SECCOMP
4786 Iterator j;
4787 void *id, *val;
4788 bool first = true;
4789 #endif
4790
4791 fprintf(f,
4792 "%sSystemCallFilter: ",
4793 prefix);
4794
4795 if (!c->syscall_whitelist)
4796 fputc('~', f);
4797
4798 #if HAVE_SECCOMP
4799 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4800 _cleanup_free_ char *name = NULL;
4801 const char *errno_name = NULL;
4802 int num = PTR_TO_INT(val);
4803
4804 if (first)
4805 first = false;
4806 else
4807 fputc(' ', f);
4808
4809 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4810 fputs(strna(name), f);
4811
4812 if (num >= 0) {
4813 errno_name = errno_to_name(num);
4814 if (errno_name)
4815 fprintf(f, ":%s", errno_name);
4816 else
4817 fprintf(f, ":%d", num);
4818 }
4819 }
4820 #endif
4821
4822 fputc('\n', f);
4823 }
4824
4825 if (c->syscall_archs) {
4826 #if HAVE_SECCOMP
4827 Iterator j;
4828 void *id;
4829 #endif
4830
4831 fprintf(f,
4832 "%sSystemCallArchitectures:",
4833 prefix);
4834
4835 #if HAVE_SECCOMP
4836 SET_FOREACH(id, c->syscall_archs, j)
4837 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4838 #endif
4839 fputc('\n', f);
4840 }
4841
4842 if (exec_context_restrict_namespaces_set(c)) {
4843 _cleanup_free_ char *s = NULL;
4844
4845 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4846 if (r >= 0)
4847 fprintf(f, "%sRestrictNamespaces: %s\n",
4848 prefix, s);
4849 }
4850
4851 if (c->network_namespace_path)
4852 fprintf(f,
4853 "%sNetworkNamespacePath: %s\n",
4854 prefix, c->network_namespace_path);
4855
4856 if (c->syscall_errno > 0) {
4857 const char *errno_name;
4858
4859 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4860
4861 errno_name = errno_to_name(c->syscall_errno);
4862 if (errno_name)
4863 fprintf(f, "%s\n", errno_name);
4864 else
4865 fprintf(f, "%d\n", c->syscall_errno);
4866 }
4867 }
4868
4869 bool exec_context_maintains_privileges(const ExecContext *c) {
4870 assert(c);
4871
4872 /* Returns true if the process forked off would run under
4873 * an unchanged UID or as root. */
4874
4875 if (!c->user)
4876 return true;
4877
4878 if (streq(c->user, "root") || streq(c->user, "0"))
4879 return true;
4880
4881 return false;
4882 }
4883
4884 int exec_context_get_effective_ioprio(const ExecContext *c) {
4885 int p;
4886
4887 assert(c);
4888
4889 if (c->ioprio_set)
4890 return c->ioprio;
4891
4892 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4893 if (p < 0)
4894 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4895
4896 return p;
4897 }
4898
4899 void exec_context_free_log_extra_fields(ExecContext *c) {
4900 size_t l;
4901
4902 assert(c);
4903
4904 for (l = 0; l < c->n_log_extra_fields; l++)
4905 free(c->log_extra_fields[l].iov_base);
4906 c->log_extra_fields = mfree(c->log_extra_fields);
4907 c->n_log_extra_fields = 0;
4908 }
4909
4910 void exec_context_revert_tty(ExecContext *c) {
4911 int r;
4912
4913 assert(c);
4914
4915 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
4916 exec_context_tty_reset(c, NULL);
4917
4918 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
4919 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
4920 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
4921
4922 if (exec_context_may_touch_tty(c)) {
4923 const char *path;
4924
4925 path = exec_context_tty_path(c);
4926 if (path) {
4927 r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
4928 if (r < 0 && r != -ENOENT)
4929 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
4930 }
4931 }
4932 }
4933
4934 int exec_context_get_clean_directories(
4935 ExecContext *c,
4936 char **prefix,
4937 ExecCleanMask mask,
4938 char ***ret) {
4939
4940 _cleanup_strv_free_ char **l = NULL;
4941 ExecDirectoryType t;
4942 int r;
4943
4944 assert(c);
4945 assert(prefix);
4946 assert(ret);
4947
4948 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4949 char **i;
4950
4951 if (!FLAGS_SET(mask, 1U << t))
4952 continue;
4953
4954 if (!prefix[t])
4955 continue;
4956
4957 STRV_FOREACH(i, c->directories[t].paths) {
4958 char *j;
4959
4960 j = path_join(prefix[t], *i);
4961 if (!j)
4962 return -ENOMEM;
4963
4964 r = strv_consume(&l, j);
4965 if (r < 0)
4966 return r;
4967
4968 /* Also remove private directories unconditionally. */
4969 if (t != EXEC_DIRECTORY_CONFIGURATION) {
4970 j = path_join(prefix[t], "private", *i);
4971 if (!j)
4972 return -ENOMEM;
4973
4974 r = strv_consume(&l, j);
4975 if (r < 0)
4976 return r;
4977 }
4978 }
4979 }
4980
4981 *ret = TAKE_PTR(l);
4982 return 0;
4983 }
4984
4985 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
4986 ExecCleanMask mask = 0;
4987
4988 assert(c);
4989 assert(ret);
4990
4991 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4992 if (!strv_isempty(c->directories[t].paths))
4993 mask |= 1U << t;
4994
4995 *ret = mask;
4996 return 0;
4997 }
4998
4999 void exec_status_start(ExecStatus *s, pid_t pid) {
5000 assert(s);
5001
5002 *s = (ExecStatus) {
5003 .pid = pid,
5004 };
5005
5006 dual_timestamp_get(&s->start_timestamp);
5007 }
5008
5009 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
5010 assert(s);
5011
5012 if (s->pid != pid) {
5013 *s = (ExecStatus) {
5014 .pid = pid,
5015 };
5016 }
5017
5018 dual_timestamp_get(&s->exit_timestamp);
5019
5020 s->code = code;
5021 s->status = status;
5022
5023 if (context && context->utmp_id)
5024 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
5025 }
5026
5027 void exec_status_reset(ExecStatus *s) {
5028 assert(s);
5029
5030 *s = (ExecStatus) {};
5031 }
5032
5033 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
5034 char buf[FORMAT_TIMESTAMP_MAX];
5035
5036 assert(s);
5037 assert(f);
5038
5039 if (s->pid <= 0)
5040 return;
5041
5042 prefix = strempty(prefix);
5043
5044 fprintf(f,
5045 "%sPID: "PID_FMT"\n",
5046 prefix, s->pid);
5047
5048 if (dual_timestamp_is_set(&s->start_timestamp))
5049 fprintf(f,
5050 "%sStart Timestamp: %s\n",
5051 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
5052
5053 if (dual_timestamp_is_set(&s->exit_timestamp))
5054 fprintf(f,
5055 "%sExit Timestamp: %s\n"
5056 "%sExit Code: %s\n"
5057 "%sExit Status: %i\n",
5058 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
5059 prefix, sigchld_code_to_string(s->code),
5060 prefix, s->status);
5061 }
5062
5063 static char *exec_command_line(char **argv) {
5064 size_t k;
5065 char *n, *p, **a;
5066 bool first = true;
5067
5068 assert(argv);
5069
5070 k = 1;
5071 STRV_FOREACH(a, argv)
5072 k += strlen(*a)+3;
5073
5074 n = new(char, k);
5075 if (!n)
5076 return NULL;
5077
5078 p = n;
5079 STRV_FOREACH(a, argv) {
5080
5081 if (!first)
5082 *(p++) = ' ';
5083 else
5084 first = false;
5085
5086 if (strpbrk(*a, WHITESPACE)) {
5087 *(p++) = '\'';
5088 p = stpcpy(p, *a);
5089 *(p++) = '\'';
5090 } else
5091 p = stpcpy(p, *a);
5092
5093 }
5094
5095 *p = 0;
5096
5097 /* FIXME: this doesn't really handle arguments that have
5098 * spaces and ticks in them */
5099
5100 return n;
5101 }
5102
5103 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
5104 _cleanup_free_ char *cmd = NULL;
5105 const char *prefix2;
5106
5107 assert(c);
5108 assert(f);
5109
5110 prefix = strempty(prefix);
5111 prefix2 = strjoina(prefix, "\t");
5112
5113 cmd = exec_command_line(c->argv);
5114 fprintf(f,
5115 "%sCommand Line: %s\n",
5116 prefix, cmd ? cmd : strerror_safe(ENOMEM));
5117
5118 exec_status_dump(&c->exec_status, f, prefix2);
5119 }
5120
5121 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
5122 assert(f);
5123
5124 prefix = strempty(prefix);
5125
5126 LIST_FOREACH(command, c, c)
5127 exec_command_dump(c, f, prefix);
5128 }
5129
5130 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
5131 ExecCommand *end;
5132
5133 assert(l);
5134 assert(e);
5135
5136 if (*l) {
5137 /* It's kind of important, that we keep the order here */
5138 LIST_FIND_TAIL(command, *l, end);
5139 LIST_INSERT_AFTER(command, *l, end, e);
5140 } else
5141 *l = e;
5142 }
5143
5144 int exec_command_set(ExecCommand *c, const char *path, ...) {
5145 va_list ap;
5146 char **l, *p;
5147
5148 assert(c);
5149 assert(path);
5150
5151 va_start(ap, path);
5152 l = strv_new_ap(path, ap);
5153 va_end(ap);
5154
5155 if (!l)
5156 return -ENOMEM;
5157
5158 p = strdup(path);
5159 if (!p) {
5160 strv_free(l);
5161 return -ENOMEM;
5162 }
5163
5164 free_and_replace(c->path, p);
5165
5166 return strv_free_and_replace(c->argv, l);
5167 }
5168
5169 int exec_command_append(ExecCommand *c, const char *path, ...) {
5170 _cleanup_strv_free_ char **l = NULL;
5171 va_list ap;
5172 int r;
5173
5174 assert(c);
5175 assert(path);
5176
5177 va_start(ap, path);
5178 l = strv_new_ap(path, ap);
5179 va_end(ap);
5180
5181 if (!l)
5182 return -ENOMEM;
5183
5184 r = strv_extend_strv(&c->argv, l, false);
5185 if (r < 0)
5186 return r;
5187
5188 return 0;
5189 }
5190
5191 static void *remove_tmpdir_thread(void *p) {
5192 _cleanup_free_ char *path = p;
5193
5194 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5195 return NULL;
5196 }
5197
5198 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5199 int r;
5200
5201 if (!rt)
5202 return NULL;
5203
5204 if (rt->manager)
5205 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
5206
5207 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5208 if (destroy && rt->tmp_dir) {
5209 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
5210
5211 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
5212 if (r < 0) {
5213 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
5214 free(rt->tmp_dir);
5215 }
5216
5217 rt->tmp_dir = NULL;
5218 }
5219
5220 if (destroy && rt->var_tmp_dir) {
5221 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
5222
5223 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
5224 if (r < 0) {
5225 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
5226 free(rt->var_tmp_dir);
5227 }
5228
5229 rt->var_tmp_dir = NULL;
5230 }
5231
5232 rt->id = mfree(rt->id);
5233 rt->tmp_dir = mfree(rt->tmp_dir);
5234 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
5235 safe_close_pair(rt->netns_storage_socket);
5236 return mfree(rt);
5237 }
5238
5239 static void exec_runtime_freep(ExecRuntime **rt) {
5240 (void) exec_runtime_free(*rt, false);
5241 }
5242
5243 static int exec_runtime_allocate(ExecRuntime **ret) {
5244 ExecRuntime *n;
5245
5246 assert(ret);
5247
5248 n = new(ExecRuntime, 1);
5249 if (!n)
5250 return -ENOMEM;
5251
5252 *n = (ExecRuntime) {
5253 .netns_storage_socket = { -1, -1 },
5254 };
5255
5256 *ret = n;
5257 return 0;
5258 }
5259
5260 static int exec_runtime_add(
5261 Manager *m,
5262 const char *id,
5263 const char *tmp_dir,
5264 const char *var_tmp_dir,
5265 const int netns_storage_socket[2],
5266 ExecRuntime **ret) {
5267
5268 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
5269 int r;
5270
5271 assert(m);
5272 assert(id);
5273
5274 r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
5275 if (r < 0)
5276 return r;
5277
5278 r = exec_runtime_allocate(&rt);
5279 if (r < 0)
5280 return r;
5281
5282 rt->id = strdup(id);
5283 if (!rt->id)
5284 return -ENOMEM;
5285
5286 if (tmp_dir) {
5287 rt->tmp_dir = strdup(tmp_dir);
5288 if (!rt->tmp_dir)
5289 return -ENOMEM;
5290
5291 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
5292 assert(var_tmp_dir);
5293 rt->var_tmp_dir = strdup(var_tmp_dir);
5294 if (!rt->var_tmp_dir)
5295 return -ENOMEM;
5296 }
5297
5298 if (netns_storage_socket) {
5299 rt->netns_storage_socket[0] = netns_storage_socket[0];
5300 rt->netns_storage_socket[1] = netns_storage_socket[1];
5301 }
5302
5303 r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
5304 if (r < 0)
5305 return r;
5306
5307 rt->manager = m;
5308
5309 if (ret)
5310 *ret = rt;
5311
5312 /* do not remove created ExecRuntime object when the operation succeeds. */
5313 rt = NULL;
5314 return 0;
5315 }
5316
5317 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5318 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
5319 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
5320 int r;
5321
5322 assert(m);
5323 assert(c);
5324 assert(id);
5325
5326 /* It is not necessary to create ExecRuntime object. */
5327 if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
5328 return 0;
5329
5330 if (c->private_tmp) {
5331 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
5332 if (r < 0)
5333 return r;
5334 }
5335
5336 if (c->private_network || c->network_namespace_path) {
5337 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5338 return -errno;
5339 }
5340
5341 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
5342 if (r < 0)
5343 return r;
5344
5345 /* Avoid cleanup */
5346 netns_storage_socket[0] = netns_storage_socket[1] = -1;
5347 return 1;
5348 }
5349
5350 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5351 ExecRuntime *rt;
5352 int r;
5353
5354 assert(m);
5355 assert(id);
5356 assert(ret);
5357
5358 rt = hashmap_get(m->exec_runtime_by_id, id);
5359 if (rt)
5360 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5361 goto ref;
5362
5363 if (!create)
5364 return 0;
5365
5366 /* If not found, then create a new object. */
5367 r = exec_runtime_make(m, c, id, &rt);
5368 if (r <= 0)
5369 /* When r == 0, it is not necessary to create ExecRuntime object. */
5370 return r;
5371
5372 ref:
5373 /* increment reference counter. */
5374 rt->n_ref++;
5375 *ret = rt;
5376 return 1;
5377 }
5378
5379 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5380 if (!rt)
5381 return NULL;
5382
5383 assert(rt->n_ref > 0);
5384
5385 rt->n_ref--;
5386 if (rt->n_ref > 0)
5387 return NULL;
5388
5389 return exec_runtime_free(rt, destroy);
5390 }
5391
5392 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5393 ExecRuntime *rt;
5394 Iterator i;
5395
5396 assert(m);
5397 assert(f);
5398 assert(fds);
5399
5400 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5401 fprintf(f, "exec-runtime=%s", rt->id);
5402
5403 if (rt->tmp_dir)
5404 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
5405
5406 if (rt->var_tmp_dir)
5407 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
5408
5409 if (rt->netns_storage_socket[0] >= 0) {
5410 int copy;
5411
5412 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5413 if (copy < 0)
5414 return copy;
5415
5416 fprintf(f, " netns-socket-0=%i", copy);
5417 }
5418
5419 if (rt->netns_storage_socket[1] >= 0) {
5420 int copy;
5421
5422 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5423 if (copy < 0)
5424 return copy;
5425
5426 fprintf(f, " netns-socket-1=%i", copy);
5427 }
5428
5429 fputc('\n', f);
5430 }
5431
5432 return 0;
5433 }
5434
5435 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5436 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5437 ExecRuntime *rt;
5438 int r;
5439
5440 /* This is for the migration from old (v237 or earlier) deserialization text.
5441 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5442 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5443 * so or not from the serialized text, then we always creates a new object owned by this. */
5444
5445 assert(u);
5446 assert(key);
5447 assert(value);
5448
5449 /* Manager manages ExecRuntime objects by the unit id.
5450 * So, we omit the serialized text when the unit does not have id (yet?)... */
5451 if (isempty(u->id)) {
5452 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5453 return 0;
5454 }
5455
5456 r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5457 if (r < 0) {
5458 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5459 return 0;
5460 }
5461
5462 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5463 if (!rt) {
5464 r = exec_runtime_allocate(&rt_create);
5465 if (r < 0)
5466 return log_oom();
5467
5468 rt_create->id = strdup(u->id);
5469 if (!rt_create->id)
5470 return log_oom();
5471
5472 rt = rt_create;
5473 }
5474
5475 if (streq(key, "tmp-dir")) {
5476 char *copy;
5477
5478 copy = strdup(value);
5479 if (!copy)
5480 return log_oom();
5481
5482 free_and_replace(rt->tmp_dir, copy);
5483
5484 } else if (streq(key, "var-tmp-dir")) {
5485 char *copy;
5486
5487 copy = strdup(value);
5488 if (!copy)
5489 return log_oom();
5490
5491 free_and_replace(rt->var_tmp_dir, copy);
5492
5493 } else if (streq(key, "netns-socket-0")) {
5494 int fd;
5495
5496 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5497 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5498 return 0;
5499 }
5500
5501 safe_close(rt->netns_storage_socket[0]);
5502 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5503
5504 } else if (streq(key, "netns-socket-1")) {
5505 int fd;
5506
5507 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5508 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5509 return 0;
5510 }
5511
5512 safe_close(rt->netns_storage_socket[1]);
5513 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5514 } else
5515 return 0;
5516
5517 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5518 if (rt_create) {
5519 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5520 if (r < 0) {
5521 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5522 return 0;
5523 }
5524
5525 rt_create->manager = u->manager;
5526
5527 /* Avoid cleanup */
5528 rt_create = NULL;
5529 }
5530
5531 return 1;
5532 }
5533
5534 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5535 char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5536 int r, fd0 = -1, fd1 = -1;
5537 const char *p, *v = value;
5538 size_t n;
5539
5540 assert(m);
5541 assert(value);
5542 assert(fds);
5543
5544 n = strcspn(v, " ");
5545 id = strndupa(v, n);
5546 if (v[n] != ' ')
5547 goto finalize;
5548 p = v + n + 1;
5549
5550 v = startswith(p, "tmp-dir=");
5551 if (v) {
5552 n = strcspn(v, " ");
5553 tmp_dir = strndupa(v, n);
5554 if (v[n] != ' ')
5555 goto finalize;
5556 p = v + n + 1;
5557 }
5558
5559 v = startswith(p, "var-tmp-dir=");
5560 if (v) {
5561 n = strcspn(v, " ");
5562 var_tmp_dir = strndupa(v, n);
5563 if (v[n] != ' ')
5564 goto finalize;
5565 p = v + n + 1;
5566 }
5567
5568 v = startswith(p, "netns-socket-0=");
5569 if (v) {
5570 char *buf;
5571
5572 n = strcspn(v, " ");
5573 buf = strndupa(v, n);
5574 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5575 log_debug("Unable to process exec-runtime netns fd specification.");
5576 return;
5577 }
5578 fd0 = fdset_remove(fds, fd0);
5579 if (v[n] != ' ')
5580 goto finalize;
5581 p = v + n + 1;
5582 }
5583
5584 v = startswith(p, "netns-socket-1=");
5585 if (v) {
5586 char *buf;
5587
5588 n = strcspn(v, " ");
5589 buf = strndupa(v, n);
5590 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5591 log_debug("Unable to process exec-runtime netns fd specification.");
5592 return;
5593 }
5594 fd1 = fdset_remove(fds, fd1);
5595 }
5596
5597 finalize:
5598
5599 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5600 if (r < 0)
5601 log_debug_errno(r, "Failed to add exec-runtime: %m");
5602 }
5603
5604 void exec_runtime_vacuum(Manager *m) {
5605 ExecRuntime *rt;
5606 Iterator i;
5607
5608 assert(m);
5609
5610 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5611
5612 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5613 if (rt->n_ref > 0)
5614 continue;
5615
5616 (void) exec_runtime_free(rt, false);
5617 }
5618 }
5619
5620 void exec_params_clear(ExecParameters *p) {
5621 if (!p)
5622 return;
5623
5624 strv_free(p->environment);
5625 }
5626
5627 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5628 [EXEC_INPUT_NULL] = "null",
5629 [EXEC_INPUT_TTY] = "tty",
5630 [EXEC_INPUT_TTY_FORCE] = "tty-force",
5631 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
5632 [EXEC_INPUT_SOCKET] = "socket",
5633 [EXEC_INPUT_NAMED_FD] = "fd",
5634 [EXEC_INPUT_DATA] = "data",
5635 [EXEC_INPUT_FILE] = "file",
5636 };
5637
5638 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5639
5640 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5641 [EXEC_OUTPUT_INHERIT] = "inherit",
5642 [EXEC_OUTPUT_NULL] = "null",
5643 [EXEC_OUTPUT_TTY] = "tty",
5644 [EXEC_OUTPUT_SYSLOG] = "syslog",
5645 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
5646 [EXEC_OUTPUT_KMSG] = "kmsg",
5647 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5648 [EXEC_OUTPUT_JOURNAL] = "journal",
5649 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5650 [EXEC_OUTPUT_SOCKET] = "socket",
5651 [EXEC_OUTPUT_NAMED_FD] = "fd",
5652 [EXEC_OUTPUT_FILE] = "file",
5653 [EXEC_OUTPUT_FILE_APPEND] = "append",
5654 };
5655
5656 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5657
5658 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5659 [EXEC_UTMP_INIT] = "init",
5660 [EXEC_UTMP_LOGIN] = "login",
5661 [EXEC_UTMP_USER] = "user",
5662 };
5663
5664 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5665
5666 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5667 [EXEC_PRESERVE_NO] = "no",
5668 [EXEC_PRESERVE_YES] = "yes",
5669 [EXEC_PRESERVE_RESTART] = "restart",
5670 };
5671
5672 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5673
5674 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
5675 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5676 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5677 [EXEC_DIRECTORY_STATE] = "StateDirectory",
5678 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5679 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5680 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5681 };
5682
5683 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5684
5685 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
5686 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
5687 * directories, specifically .timer units with their timestamp touch file. */
5688 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5689 [EXEC_DIRECTORY_RUNTIME] = "runtime",
5690 [EXEC_DIRECTORY_STATE] = "state",
5691 [EXEC_DIRECTORY_CACHE] = "cache",
5692 [EXEC_DIRECTORY_LOGS] = "logs",
5693 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
5694 };
5695
5696 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
5697
5698 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
5699 * the service payload in. */
5700 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5701 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
5702 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
5703 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
5704 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
5705 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
5706 };
5707
5708 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5709
5710 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5711 [EXEC_KEYRING_INHERIT] = "inherit",
5712 [EXEC_KEYRING_PRIVATE] = "private",
5713 [EXEC_KEYRING_SHARED] = "shared",
5714 };
5715
5716 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);