]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
Merge pull request #13246 from keszybz/add-SystemdOptions-efi-variable
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <glob.h>
6 #include <grp.h>
7 #include <poll.h>
8 #include <signal.h>
9 #include <string.h>
10 #include <sys/capability.h>
11 #include <sys/eventfd.h>
12 #include <sys/mman.h>
13 #include <sys/personality.h>
14 #include <sys/prctl.h>
15 #include <sys/shm.h>
16 #include <sys/socket.h>
17 #include <sys/stat.h>
18 #include <sys/types.h>
19 #include <sys/un.h>
20 #include <unistd.h>
21 #include <utmpx.h>
22
23 #if HAVE_PAM
24 #include <security/pam_appl.h>
25 #endif
26
27 #if HAVE_SELINUX
28 #include <selinux/selinux.h>
29 #endif
30
31 #if HAVE_SECCOMP
32 #include <seccomp.h>
33 #endif
34
35 #if HAVE_APPARMOR
36 #include <sys/apparmor.h>
37 #endif
38
39 #include "sd-messages.h"
40
41 #include "af-list.h"
42 #include "alloc-util.h"
43 #if HAVE_APPARMOR
44 #include "apparmor-util.h"
45 #endif
46 #include "async.h"
47 #include "barrier.h"
48 #include "cap-list.h"
49 #include "capability-util.h"
50 #include "chown-recursive.h"
51 #include "cgroup-setup.h"
52 #include "cpu-set-util.h"
53 #include "def.h"
54 #include "env-file.h"
55 #include "env-util.h"
56 #include "errno-list.h"
57 #include "execute.h"
58 #include "exit-status.h"
59 #include "fd-util.h"
60 #include "format-util.h"
61 #include "fs-util.h"
62 #include "glob-util.h"
63 #include "io-util.h"
64 #include "ioprio.h"
65 #include "label.h"
66 #include "log.h"
67 #include "macro.h"
68 #include "manager.h"
69 #include "memory-util.h"
70 #include "missing.h"
71 #include "mkdir.h"
72 #include "namespace.h"
73 #include "parse-util.h"
74 #include "path-util.h"
75 #include "process-util.h"
76 #include "rlimit-util.h"
77 #include "rm-rf.h"
78 #if HAVE_SECCOMP
79 #include "seccomp-util.h"
80 #endif
81 #include "securebits-util.h"
82 #include "selinux-util.h"
83 #include "signal-util.h"
84 #include "smack-util.h"
85 #include "socket-util.h"
86 #include "special.h"
87 #include "stat-util.h"
88 #include "string-table.h"
89 #include "string-util.h"
90 #include "strv.h"
91 #include "syslog-util.h"
92 #include "terminal-util.h"
93 #include "umask-util.h"
94 #include "unit.h"
95 #include "user-util.h"
96 #include "utmp-wtmp.h"
97
98 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
99 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
100
101 #define SNDBUF_SIZE (8*1024*1024)
102
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0, retry_at;

        /* Relocates the passed file descriptors so that fds[idx] ends up being fd number idx+3, i.e. the
         * array occupies a contiguous range starting right after stderr. The fds array is rewritten in
         * place. Returns 0 on success, or a negative errno-style value if duplicating an fd fails. */

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int idx;

                retry_at = -1;

                for (idx = first; idx < (int) n_fds; idx++) {
                        int target = idx + 3, dup_fd;

                        /* Nothing to do if this one already sits in its slot */
                        if (fds[idx] == target)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, target);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* F_DUPFD hands out the lowest free fd >= target. If that is not the target
                         * itself, the slot is still occupied; remember the first such index so that
                         * another pass starts from there. */
                        if (dup_fd != target && retry_at < 0)
                                retry_at = idx;
                }

                first = retry_at;
        } while (retry_at >= 0);

        return 0;
}
147
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total, k;
        int r;

        /* Adjusts the file flags of the fds we are about to pass on: the first n_socket_fds entries get
         * O_NONBLOCK set or cleared according to 'nonblock', and FD_CLOEXEC is cleared on every entry.
         * Returns 0 on success, or the negative error of the first helper call that fails. */

        total = n_socket_fds + n_storage_fds;
        if (total == 0)
                return 0;

        assert(fds);

        for (k = 0; k < total; k++) {
                bool is_socket = k < n_socket_fds;

                /* O_NONBLOCK is only relevant for the socket activation fds */
                if (is_socket) {
                        r = fd_nonblock(fds[k], nonblock);
                        if (r < 0)
                                return r;
                }

                /* These fds are meant to survive exec() in the child, hence FD_CLOEXEC always goes away */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
180
181 static const char *exec_context_tty_path(const ExecContext *context) {
182 assert(context);
183
184 if (context->stdio_as_fds)
185 return NULL;
186
187 if (context->tty_path)
188 return context->tty_path;
189
190 return "/dev/console";
191 }
192
193 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
194 const char *path;
195
196 assert(context);
197
198 path = exec_context_tty_path(context);
199
200 if (context->tty_vhangup) {
201 if (p && p->stdin_fd >= 0)
202 (void) terminal_vhangup_fd(p->stdin_fd);
203 else if (path)
204 (void) terminal_vhangup(path);
205 }
206
207 if (context->tty_reset) {
208 if (p && p->stdin_fd >= 0)
209 (void) reset_terminal_fd(p->stdin_fd, true);
210 else if (path)
211 (void) reset_terminal(path);
212 }
213
214 if (context->tty_vt_disallocate && path)
215 (void) vt_disallocate(path);
216 }
217
218 static bool is_terminal_input(ExecInput i) {
219 return IN_SET(i,
220 EXEC_INPUT_TTY,
221 EXEC_INPUT_TTY_FORCE,
222 EXEC_INPUT_TTY_FAIL);
223 }
224
225 static bool is_terminal_output(ExecOutput o) {
226 return IN_SET(o,
227 EXEC_OUTPUT_TTY,
228 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
229 EXEC_OUTPUT_KMSG_AND_CONSOLE,
230 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
231 }
232
233 static bool is_syslog_output(ExecOutput o) {
234 return IN_SET(o,
235 EXEC_OUTPUT_SYSLOG,
236 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
237 }
238
239 static bool is_kmsg_output(ExecOutput o) {
240 return IN_SET(o,
241 EXEC_OUTPUT_KMSG,
242 EXEC_OUTPUT_KMSG_AND_CONSOLE);
243 }
244
245 static bool exec_context_needs_term(const ExecContext *c) {
246 assert(c);
247
248 /* Return true if the execution context suggests we should set $TERM to something useful. */
249
250 if (is_terminal_input(c->std_input))
251 return true;
252
253 if (is_terminal_output(c->std_output))
254 return true;
255
256 if (is_terminal_output(c->std_error))
257 return true;
258
259 return !!c->tty_path;
260 }
261
262 static int open_null_as(int flags, int nfd) {
263 int fd;
264
265 assert(nfd >= 0);
266
267 fd = open("/dev/null", flags|O_NOCTTY);
268 if (fd < 0)
269 return -errno;
270
271 return move_fd(fd, nfd, false);
272 }
273
274 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
275 static const union sockaddr_union sa = {
276 .un.sun_family = AF_UNIX,
277 .un.sun_path = "/run/systemd/journal/stdout",
278 };
279 uid_t olduid = UID_INVALID;
280 gid_t oldgid = GID_INVALID;
281 int r;
282
283 if (gid_is_valid(gid)) {
284 oldgid = getgid();
285
286 if (setegid(gid) < 0)
287 return -errno;
288 }
289
290 if (uid_is_valid(uid)) {
291 olduid = getuid();
292
293 if (seteuid(uid) < 0) {
294 r = -errno;
295 goto restore_gid;
296 }
297 }
298
299 r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
300
301 /* If we fail to restore the uid or gid, things will likely
302 fail later on. This should only happen if an LSM interferes. */
303
304 if (uid_is_valid(uid))
305 (void) seteuid(olduid);
306
307 restore_gid:
308 if (gid_is_valid(gid))
309 (void) setegid(oldgid);
310
311 return r;
312 }
313
314 static int connect_logger_as(
315 const Unit *unit,
316 const ExecContext *context,
317 const ExecParameters *params,
318 ExecOutput output,
319 const char *ident,
320 int nfd,
321 uid_t uid,
322 gid_t gid) {
323
324 _cleanup_close_ int fd = -1;
325 int r;
326
327 assert(context);
328 assert(params);
329 assert(output < _EXEC_OUTPUT_MAX);
330 assert(ident);
331 assert(nfd >= 0);
332
333 fd = socket(AF_UNIX, SOCK_STREAM, 0);
334 if (fd < 0)
335 return -errno;
336
337 r = connect_journal_socket(fd, uid, gid);
338 if (r < 0)
339 return r;
340
341 if (shutdown(fd, SHUT_RD) < 0)
342 return -errno;
343
344 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
345
346 if (dprintf(fd,
347 "%s\n"
348 "%s\n"
349 "%i\n"
350 "%i\n"
351 "%i\n"
352 "%i\n"
353 "%i\n",
354 context->syslog_identifier ?: ident,
355 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
356 context->syslog_priority,
357 !!context->syslog_level_prefix,
358 is_syslog_output(output),
359 is_kmsg_output(output),
360 is_terminal_output(output)) < 0)
361 return -errno;
362
363 return move_fd(TAKE_FD(fd), nfd, false);
364 }
365
366 static int open_terminal_as(const char *path, int flags, int nfd) {
367 int fd;
368
369 assert(path);
370 assert(nfd >= 0);
371
372 fd = open_terminal(path, flags | O_NOCTTY);
373 if (fd < 0)
374 return fd;
375
376 return move_fd(fd, nfd, false);
377 }
378
379 static int acquire_path(const char *path, int flags, mode_t mode) {
380 union sockaddr_union sa = {};
381 _cleanup_close_ int fd = -1;
382 int r, salen;
383
384 assert(path);
385
386 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
387 flags |= O_CREAT;
388
389 fd = open(path, flags|O_NOCTTY, mode);
390 if (fd >= 0)
391 return TAKE_FD(fd);
392
393 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
394 return -errno;
395 if (strlen(path) >= sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
396 return -ENXIO;
397
398 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
399
400 fd = socket(AF_UNIX, SOCK_STREAM, 0);
401 if (fd < 0)
402 return -errno;
403
404 salen = sockaddr_un_set_path(&sa.un, path);
405 if (salen < 0)
406 return salen;
407
408 if (connect(fd, &sa.sa, salen) < 0)
409 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
410 * indication that his wasn't an AF_UNIX socket after all */
411
412 if ((flags & O_ACCMODE) == O_RDONLY)
413 r = shutdown(fd, SHUT_WR);
414 else if ((flags & O_ACCMODE) == O_WRONLY)
415 r = shutdown(fd, SHUT_RD);
416 else
417 return TAKE_FD(fd);
418 if (r < 0)
419 return -errno;
420
421 return TAKE_FD(fd);
422 }
423
424 static int fixup_input(
425 const ExecContext *context,
426 int socket_fd,
427 bool apply_tty_stdin) {
428
429 ExecInput std_input;
430
431 assert(context);
432
433 std_input = context->std_input;
434
435 if (is_terminal_input(std_input) && !apply_tty_stdin)
436 return EXEC_INPUT_NULL;
437
438 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
439 return EXEC_INPUT_NULL;
440
441 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
442 return EXEC_INPUT_NULL;
443
444 return std_input;
445 }
446
447 static int fixup_output(ExecOutput std_output, int socket_fd) {
448
449 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
450 return EXEC_OUTPUT_INHERIT;
451
452 return std_output;
453 }
454
455 static int setup_input(
456 const ExecContext *context,
457 const ExecParameters *params,
458 int socket_fd,
459 const int named_iofds[static 3]) {
460
461 ExecInput i;
462
463 assert(context);
464 assert(params);
465 assert(named_iofds);
466
467 if (params->stdin_fd >= 0) {
468 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
469 return -errno;
470
471 /* Try to make this the controlling tty, if it is a tty, and reset it */
472 if (isatty(STDIN_FILENO)) {
473 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
474 (void) reset_terminal_fd(STDIN_FILENO, true);
475 }
476
477 return STDIN_FILENO;
478 }
479
480 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
481
482 switch (i) {
483
484 case EXEC_INPUT_NULL:
485 return open_null_as(O_RDONLY, STDIN_FILENO);
486
487 case EXEC_INPUT_TTY:
488 case EXEC_INPUT_TTY_FORCE:
489 case EXEC_INPUT_TTY_FAIL: {
490 int fd;
491
492 fd = acquire_terminal(exec_context_tty_path(context),
493 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
494 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
495 ACQUIRE_TERMINAL_WAIT,
496 USEC_INFINITY);
497 if (fd < 0)
498 return fd;
499
500 return move_fd(fd, STDIN_FILENO, false);
501 }
502
503 case EXEC_INPUT_SOCKET:
504 assert(socket_fd >= 0);
505
506 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
507
508 case EXEC_INPUT_NAMED_FD:
509 assert(named_iofds[STDIN_FILENO] >= 0);
510
511 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
512 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
513
514 case EXEC_INPUT_DATA: {
515 int fd;
516
517 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
518 if (fd < 0)
519 return fd;
520
521 return move_fd(fd, STDIN_FILENO, false);
522 }
523
524 case EXEC_INPUT_FILE: {
525 bool rw;
526 int fd;
527
528 assert(context->stdio_file[STDIN_FILENO]);
529
530 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
531 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
532
533 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
534 if (fd < 0)
535 return fd;
536
537 return move_fd(fd, STDIN_FILENO, false);
538 }
539
540 default:
541 assert_not_reached("Unknown input type");
542 }
543 }
544
545 static bool can_inherit_stderr_from_stdout(
546 const ExecContext *context,
547 ExecOutput o,
548 ExecOutput e) {
549
550 assert(context);
551
552 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
553 * stderr fd */
554
555 if (e == EXEC_OUTPUT_INHERIT)
556 return true;
557 if (e != o)
558 return false;
559
560 if (e == EXEC_OUTPUT_NAMED_FD)
561 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
562
563 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
564 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
565
566 return true;
567 }
568
569 static int setup_output(
570 const Unit *unit,
571 const ExecContext *context,
572 const ExecParameters *params,
573 int fileno,
574 int socket_fd,
575 const int named_iofds[static 3],
576 const char *ident,
577 uid_t uid,
578 gid_t gid,
579 dev_t *journal_stream_dev,
580 ino_t *journal_stream_ino) {
581
582 ExecOutput o;
583 ExecInput i;
584 int r;
585
586 assert(unit);
587 assert(context);
588 assert(params);
589 assert(ident);
590 assert(journal_stream_dev);
591 assert(journal_stream_ino);
592
593 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
594
595 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
596 return -errno;
597
598 return STDOUT_FILENO;
599 }
600
601 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
602 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
603 return -errno;
604
605 return STDERR_FILENO;
606 }
607
608 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
609 o = fixup_output(context->std_output, socket_fd);
610
611 if (fileno == STDERR_FILENO) {
612 ExecOutput e;
613 e = fixup_output(context->std_error, socket_fd);
614
615 /* This expects the input and output are already set up */
616
617 /* Don't change the stderr file descriptor if we inherit all
618 * the way and are not on a tty */
619 if (e == EXEC_OUTPUT_INHERIT &&
620 o == EXEC_OUTPUT_INHERIT &&
621 i == EXEC_INPUT_NULL &&
622 !is_terminal_input(context->std_input) &&
623 getppid () != 1)
624 return fileno;
625
626 /* Duplicate from stdout if possible */
627 if (can_inherit_stderr_from_stdout(context, o, e))
628 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
629
630 o = e;
631
632 } else if (o == EXEC_OUTPUT_INHERIT) {
633 /* If input got downgraded, inherit the original value */
634 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
635 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
636
637 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
638 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
639 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
640
641 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
642 if (getppid() != 1)
643 return fileno;
644
645 /* We need to open /dev/null here anew, to get the right access mode. */
646 return open_null_as(O_WRONLY, fileno);
647 }
648
649 switch (o) {
650
651 case EXEC_OUTPUT_NULL:
652 return open_null_as(O_WRONLY, fileno);
653
654 case EXEC_OUTPUT_TTY:
655 if (is_terminal_input(i))
656 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
657
658 /* We don't reset the terminal if this is just about output */
659 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
660
661 case EXEC_OUTPUT_SYSLOG:
662 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
663 case EXEC_OUTPUT_KMSG:
664 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
665 case EXEC_OUTPUT_JOURNAL:
666 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
667 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
668 if (r < 0) {
669 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
670 r = open_null_as(O_WRONLY, fileno);
671 } else {
672 struct stat st;
673
674 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
675 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
676 * services to detect whether they are connected to the journal or not.
677 *
678 * If both stdout and stderr are connected to a stream then let's make sure to store the data
679 * about STDERR as that's usually the best way to do logging. */
680
681 if (fstat(fileno, &st) >= 0 &&
682 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
683 *journal_stream_dev = st.st_dev;
684 *journal_stream_ino = st.st_ino;
685 }
686 }
687 return r;
688
689 case EXEC_OUTPUT_SOCKET:
690 assert(socket_fd >= 0);
691
692 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
693
694 case EXEC_OUTPUT_NAMED_FD:
695 assert(named_iofds[fileno] >= 0);
696
697 (void) fd_nonblock(named_iofds[fileno], false);
698 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
699
700 case EXEC_OUTPUT_FILE:
701 case EXEC_OUTPUT_FILE_APPEND: {
702 bool rw;
703 int fd, flags;
704
705 assert(context->stdio_file[fileno]);
706
707 rw = context->std_input == EXEC_INPUT_FILE &&
708 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
709
710 if (rw)
711 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
712
713 flags = O_WRONLY;
714 if (o == EXEC_OUTPUT_FILE_APPEND)
715 flags |= O_APPEND;
716
717 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
718 if (fd < 0)
719 return fd;
720
721 return move_fd(fd, fileno, 0);
722 }
723
724 default:
725 assert_not_reached("Unknown error type");
726 }
727 }
728
729 static int chown_terminal(int fd, uid_t uid) {
730 int r;
731
732 assert(fd >= 0);
733
734 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
735 if (isatty(fd) < 1) {
736 if (IN_SET(errno, EINVAL, ENOTTY))
737 return 0; /* not a tty */
738
739 return -errno;
740 }
741
742 /* This might fail. What matters are the results. */
743 r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
744 if (r < 0)
745 return r;
746
747 return 1;
748 }
749
750 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
751 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
752 int r;
753
754 assert(_saved_stdin);
755 assert(_saved_stdout);
756
757 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
758 if (saved_stdin < 0)
759 return -errno;
760
761 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
762 if (saved_stdout < 0)
763 return -errno;
764
765 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
766 if (fd < 0)
767 return fd;
768
769 r = chown_terminal(fd, getuid());
770 if (r < 0)
771 return r;
772
773 r = reset_terminal_fd(fd, true);
774 if (r < 0)
775 return r;
776
777 r = rearrange_stdio(fd, fd, STDERR_FILENO);
778 fd = -1;
779 if (r < 0)
780 return r;
781
782 *_saved_stdin = saved_stdin;
783 *_saved_stdout = saved_stdout;
784
785 saved_stdin = saved_stdout = -1;
786
787 return 0;
788 }
789
790 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
791 assert(err < 0);
792
793 if (err == -ETIMEDOUT)
794 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
795 else {
796 errno = -err;
797 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
798 }
799 }
800
801 static void write_confirm_error(int err, const char *vc, const Unit *u) {
802 _cleanup_close_ int fd = -1;
803
804 assert(vc);
805
806 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
807 if (fd < 0)
808 return;
809
810 write_confirm_error_fd(err, fd, u);
811 }
812
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        /* Counterpart of setup_confirm_stdio(): releases the terminal, puts the saved fds back in place
         * as stdin/stdout and closes the saved copies. Returns 0 on success, otherwise the negative
         * errno-style value of the last dup2() that failed. */

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        /* The copies are closed in any case, and the caller's variables take over what safe_close()
         * returns */
        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
834
/* Possible results of ask_for_confirmation() */
enum {
        /* Do not execute the command, and pretend it failed */
        CONFIRM_PRETEND_FAILURE = -1,

        /* Do not execute the command, and pretend it succeeded */
        CONFIRM_PRETEND_SUCCESS = 0,

        /* Execute the command */
        CONFIRM_EXECUTE = 1,
};
840
841 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
842 int saved_stdout = -1, saved_stdin = -1, r;
843 _cleanup_free_ char *e = NULL;
844 char c;
845
846 /* For any internal errors, assume a positive response. */
847 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
848 if (r < 0) {
849 write_confirm_error(r, vc, u);
850 return CONFIRM_EXECUTE;
851 }
852
853 /* confirm_spawn might have been disabled while we were sleeping. */
854 if (manager_is_confirm_spawn_disabled(u->manager)) {
855 r = 1;
856 goto restore_stdio;
857 }
858
859 e = ellipsize(cmdline, 60, 100);
860 if (!e) {
861 log_oom();
862 r = CONFIRM_EXECUTE;
863 goto restore_stdio;
864 }
865
866 for (;;) {
867 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
868 if (r < 0) {
869 write_confirm_error_fd(r, STDOUT_FILENO, u);
870 r = CONFIRM_EXECUTE;
871 goto restore_stdio;
872 }
873
874 switch (c) {
875 case 'c':
876 printf("Resuming normal execution.\n");
877 manager_disable_confirm_spawn();
878 r = 1;
879 break;
880 case 'D':
881 unit_dump(u, stdout, " ");
882 continue; /* ask again */
883 case 'f':
884 printf("Failing execution.\n");
885 r = CONFIRM_PRETEND_FAILURE;
886 break;
887 case 'h':
888 printf(" c - continue, proceed without asking anymore\n"
889 " D - dump, show the state of the unit\n"
890 " f - fail, don't execute the command and pretend it failed\n"
891 " h - help\n"
892 " i - info, show a short summary of the unit\n"
893 " j - jobs, show jobs that are in progress\n"
894 " s - skip, don't execute the command and pretend it succeeded\n"
895 " y - yes, execute the command\n");
896 continue; /* ask again */
897 case 'i':
898 printf(" Description: %s\n"
899 " Unit: %s\n"
900 " Command: %s\n",
901 u->id, u->description, cmdline);
902 continue; /* ask again */
903 case 'j':
904 manager_dump_jobs(u->manager, stdout, " ");
905 continue; /* ask again */
906 case 'n':
907 /* 'n' was removed in favor of 'f'. */
908 printf("Didn't understand 'n', did you mean 'f'?\n");
909 continue; /* ask again */
910 case 's':
911 printf("Skipping execution.\n");
912 r = CONFIRM_PRETEND_SUCCESS;
913 break;
914 case 'y':
915 r = CONFIRM_EXECUTE;
916 break;
917 default:
918 assert_not_reached("Unhandled choice");
919 }
920 break;
921 }
922
923 restore_stdio:
924 restore_confirm_stdio(&saved_stdin, &saved_stdout);
925 return r;
926 }
927
928 static int get_fixed_user(const ExecContext *c, const char **user,
929 uid_t *uid, gid_t *gid,
930 const char **home, const char **shell) {
931 int r;
932 const char *name;
933
934 assert(c);
935
936 if (!c->user)
937 return 0;
938
939 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
940 * (i.e. are "/" or "/bin/nologin"). */
941
942 name = c->user;
943 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
944 if (r < 0)
945 return r;
946
947 *user = name;
948 return 0;
949 }
950
951 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
952 int r;
953 const char *name;
954
955 assert(c);
956
957 if (!c->group)
958 return 0;
959
960 name = c->group;
961 r = get_group_creds(&name, gid, 0);
962 if (r < 0)
963 return r;
964
965 *group = name;
966 return 0;
967 }
968
969 static int get_supplementary_groups(const ExecContext *c, const char *user,
970 const char *group, gid_t gid,
971 gid_t **supplementary_gids, int *ngids) {
972 char **i;
973 int r, k = 0;
974 int ngroups_max;
975 bool keep_groups = false;
976 gid_t *groups = NULL;
977 _cleanup_free_ gid_t *l_gids = NULL;
978
979 assert(c);
980
981 /*
982 * If user is given, then lookup GID and supplementary groups list.
983 * We avoid NSS lookups for gid=0. Also we have to initialize groups
984 * here and as early as possible so we keep the list of supplementary
985 * groups of the caller.
986 */
987 if (user && gid_is_valid(gid) && gid != 0) {
988 /* First step, initialize groups from /etc/groups */
989 if (initgroups(user, gid) < 0)
990 return -errno;
991
992 keep_groups = true;
993 }
994
995 if (strv_isempty(c->supplementary_groups))
996 return 0;
997
998 /*
999 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1000 * be positive, otherwise fail.
1001 */
1002 errno = 0;
1003 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1004 if (ngroups_max <= 0)
1005 return errno_or_else(EOPNOTSUPP);
1006
1007 l_gids = new(gid_t, ngroups_max);
1008 if (!l_gids)
1009 return -ENOMEM;
1010
1011 if (keep_groups) {
1012 /*
1013 * Lookup the list of groups that the user belongs to, we
1014 * avoid NSS lookups here too for gid=0.
1015 */
1016 k = ngroups_max;
1017 if (getgrouplist(user, gid, l_gids, &k) < 0)
1018 return -EINVAL;
1019 } else
1020 k = 0;
1021
1022 STRV_FOREACH(i, c->supplementary_groups) {
1023 const char *g;
1024
1025 if (k >= ngroups_max)
1026 return -E2BIG;
1027
1028 g = *i;
1029 r = get_group_creds(&g, l_gids+k, 0);
1030 if (r < 0)
1031 return r;
1032
1033 k++;
1034 }
1035
1036 /*
1037 * Sets ngids to zero to drop all supplementary groups, happens
1038 * when we are under root and SupplementaryGroups= is empty.
1039 */
1040 if (k == 0) {
1041 *ngids = 0;
1042 return 0;
1043 }
1044
1045 /* Otherwise get the final list of supplementary groups */
1046 groups = memdup(l_gids, sizeof(gid_t) * k);
1047 if (!groups)
1048 return -ENOMEM;
1049
1050 *supplementary_gids = groups;
1051 *ngids = k;
1052
1053 groups = NULL;
1054
1055 return 0;
1056 }
1057
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        int r;

        /* Applies the supplementary groups (if any were passed) and then the real, effective and saved
         * GID (if gid is valid). Returns 0 on success, a negative errno-style value on failure. */

        /* SupplementaryGroups= is only handled if it is not empty */
        if (ngids > 0) {
                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1076
1077 static int enforce_user(const ExecContext *context, uid_t uid) {
1078 assert(context);
1079
1080 if (!uid_is_valid(uid))
1081 return 0;
1082
1083 /* Sets (but doesn't look up) the uid and make sure we keep the
1084 * capabilities while doing so. */
1085
1086 if (context->capability_ambient_set != 0) {
1087
1088 /* First step: If we need to keep capabilities but
1089 * drop privileges we need to make sure we keep our
1090 * caps, while we drop privileges. */
1091 if (uid != 0) {
1092 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1093
1094 if (prctl(PR_GET_SECUREBITS) != sb)
1095 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1096 return -errno;
1097 }
1098 }
1099
1100 /* Second step: actually set the uids */
1101 if (setresuid(uid, uid, uid) < 0)
1102 return -errno;
1103
1104 /* At this point we should have all necessary capabilities but
1105 are otherwise a normal user. However, the caps might got
1106 corrupted due to the setresuid() so we need clean them up
1107 later. This is done outside of this call. */
1108
1109 return 0;
1110 }
1111
1112 #if HAVE_PAM
1113
1114 static int null_conv(
1115 int num_msg,
1116 const struct pam_message **msg,
1117 struct pam_response **resp,
1118 void *appdata_ptr) {
1119
1120 /* We don't support conversations */
1121
1122 return PAM_CONV_ERR;
1123 }
1124
1125 #endif
1126
1127 static int setup_pam(
1128 const char *name,
1129 const char *user,
1130 uid_t uid,
1131 gid_t gid,
1132 const char *tty,
1133 char ***env,
1134 int fds[], size_t n_fds) {
1135
1136 #if HAVE_PAM
1137
1138 static const struct pam_conv conv = {
1139 .conv = null_conv,
1140 .appdata_ptr = NULL
1141 };
1142
1143 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1144 pam_handle_t *handle = NULL;
1145 sigset_t old_ss;
1146 int pam_code = PAM_SUCCESS, r;
1147 char **nv, **e = NULL;
1148 bool close_session = false;
1149 pid_t pam_pid = 0, parent_pid;
1150 int flags = 0;
1151
1152 assert(name);
1153 assert(user);
1154 assert(env);
1155
1156 /* We set up PAM in the parent process, then fork. The child
1157 * will then stay around until killed via PR_GET_PDEATHSIG or
1158 * systemd via the cgroup logic. It will then remove the PAM
1159 * session again. The parent process will exec() the actual
1160 * daemon. We do things this way to ensure that the main PID
1161 * of the daemon is the one we initially fork()ed. */
1162
1163 r = barrier_create(&barrier);
1164 if (r < 0)
1165 goto fail;
1166
1167 if (log_get_max_level() < LOG_DEBUG)
1168 flags |= PAM_SILENT;
1169
1170 pam_code = pam_start(name, user, &conv, &handle);
1171 if (pam_code != PAM_SUCCESS) {
1172 handle = NULL;
1173 goto fail;
1174 }
1175
1176 if (!tty) {
1177 _cleanup_free_ char *q = NULL;
1178
1179 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1180 * out if that's the case, and read the TTY off it. */
1181
1182 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1183 tty = strjoina("/dev/", q);
1184 }
1185
1186 if (tty) {
1187 pam_code = pam_set_item(handle, PAM_TTY, tty);
1188 if (pam_code != PAM_SUCCESS)
1189 goto fail;
1190 }
1191
1192 STRV_FOREACH(nv, *env) {
1193 pam_code = pam_putenv(handle, *nv);
1194 if (pam_code != PAM_SUCCESS)
1195 goto fail;
1196 }
1197
1198 pam_code = pam_acct_mgmt(handle, flags);
1199 if (pam_code != PAM_SUCCESS)
1200 goto fail;
1201
1202 pam_code = pam_open_session(handle, flags);
1203 if (pam_code != PAM_SUCCESS)
1204 goto fail;
1205
1206 close_session = true;
1207
1208 e = pam_getenvlist(handle);
1209 if (!e) {
1210 pam_code = PAM_BUF_ERR;
1211 goto fail;
1212 }
1213
1214 /* Block SIGTERM, so that we know that it won't get lost in
1215 * the child */
1216
1217 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1218
1219 parent_pid = getpid_cached();
1220
1221 r = safe_fork("(sd-pam)", 0, &pam_pid);
1222 if (r < 0)
1223 goto fail;
1224 if (r == 0) {
1225 int sig, ret = EXIT_PAM;
1226
1227 /* The child's job is to reset the PAM session on
1228 * termination */
1229 barrier_set_role(&barrier, BARRIER_CHILD);
1230
1231 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
1232 * are open here that have been opened by PAM. */
1233 (void) close_many(fds, n_fds);
1234
1235 /* Drop privileges - we don't need any to pam_close_session
1236 * and this will make PR_SET_PDEATHSIG work in most cases.
1237 * If this fails, ignore the error - but expect sd-pam threads
1238 * to fail to exit normally */
1239
1240 r = maybe_setgroups(0, NULL);
1241 if (r < 0)
1242 log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
1243 if (setresgid(gid, gid, gid) < 0)
1244 log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
1245 if (setresuid(uid, uid, uid) < 0)
1246 log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
1247
1248 (void) ignore_signals(SIGPIPE, -1);
1249
1250 /* Wait until our parent died. This will only work if
1251 * the above setresuid() succeeds, otherwise the kernel
1252 * will not allow unprivileged parents kill their privileged
1253 * children this way. We rely on the control groups kill logic
1254 * to do the rest for us. */
1255 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1256 goto child_finish;
1257
1258 /* Tell the parent that our setup is done. This is especially
1259 * important regarding dropping privileges. Otherwise, unit
1260 * setup might race against our setresuid(2) call.
1261 *
1262 * If the parent aborted, we'll detect this below, hence ignore
1263 * return failure here. */
1264 (void) barrier_place(&barrier);
1265
1266 /* Check if our parent process might already have died? */
1267 if (getppid() == parent_pid) {
1268 sigset_t ss;
1269
1270 assert_se(sigemptyset(&ss) >= 0);
1271 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1272
1273 for (;;) {
1274 if (sigwait(&ss, &sig) < 0) {
1275 if (errno == EINTR)
1276 continue;
1277
1278 goto child_finish;
1279 }
1280
1281 assert(sig == SIGTERM);
1282 break;
1283 }
1284 }
1285
1286 /* If our parent died we'll end the session */
1287 if (getppid() != parent_pid) {
1288 pam_code = pam_close_session(handle, flags);
1289 if (pam_code != PAM_SUCCESS)
1290 goto child_finish;
1291 }
1292
1293 ret = 0;
1294
1295 child_finish:
1296 pam_end(handle, pam_code | flags);
1297 _exit(ret);
1298 }
1299
1300 barrier_set_role(&barrier, BARRIER_PARENT);
1301
1302 /* If the child was forked off successfully it will do all the
1303 * cleanups, so forget about the handle here. */
1304 handle = NULL;
1305
1306 /* Unblock SIGTERM again in the parent */
1307 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1308
1309 /* We close the log explicitly here, since the PAM modules
1310 * might have opened it, but we don't want this fd around. */
1311 closelog();
1312
1313 /* Synchronously wait for the child to initialize. We don't care for
1314 * errors as we cannot recover. However, warn loudly if it happens. */
1315 if (!barrier_place_and_sync(&barrier))
1316 log_error("PAM initialization failed");
1317
1318 return strv_free_and_replace(*env, e);
1319
1320 fail:
1321 if (pam_code != PAM_SUCCESS) {
1322 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1323 r = -EPERM; /* PAM errors do not map to errno */
1324 } else
1325 log_error_errno(r, "PAM failed: %m");
1326
1327 if (handle) {
1328 if (close_session)
1329 pam_code = pam_close_session(handle, flags);
1330
1331 pam_end(handle, pam_code | flags);
1332 }
1333
1334 strv_free(e);
1335 closelog();
1336
1337 return r;
1338 #else
1339 return 0;
1340 #endif
1341 }
1342
/* Renames the calling process to "(<name>)", where <name> is the last component of the specified path,
 * reduced to its final 8 characters if it is longer than that. If the path has no usable last
 * component the process is renamed to "(...)" instead. */
static void rename_process_from_path(const char *path) {
        char buf[11]; /* '(' + at most 8 characters + ')' + NUL */
        const char *name;
        size_t len;

        /* The visible part of the result must fit in 10 chars (i.e. the length of "/sbin/init") to
         * look pretty in /bin/ps */

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        len = strlen(name);
        if (len > 8) {
                /* Keep the tail rather than the head: the end of the process name is usually more
                 * interesting, since the first bit might just be "systemd-" */
                name += len - 8;
                len = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, name, len);
        buf[len + 1] = ')';
        buf[len + 2] = 0;

        rename_process(buf);
}
1373
1374 static bool context_has_address_families(const ExecContext *c) {
1375 assert(c);
1376
1377 return c->address_families_whitelist ||
1378 !set_isempty(c->address_families);
1379 }
1380
1381 static bool context_has_syscall_filters(const ExecContext *c) {
1382 assert(c);
1383
1384 return c->syscall_whitelist ||
1385 !hashmap_isempty(c->syscall_filter);
1386 }
1387
1388 static bool context_has_no_new_privileges(const ExecContext *c) {
1389 assert(c);
1390
1391 if (c->no_new_privileges)
1392 return true;
1393
1394 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1395 return false;
1396
1397 /* We need NNP if we have any form of seccomp and are unprivileged */
1398 return context_has_address_families(c) ||
1399 c->memory_deny_write_execute ||
1400 c->restrict_realtime ||
1401 c->restrict_suid_sgid ||
1402 exec_context_restrict_namespaces_set(c) ||
1403 c->protect_kernel_tunables ||
1404 c->protect_kernel_modules ||
1405 c->private_devices ||
1406 context_has_syscall_filters(c) ||
1407 !set_isempty(c->syscall_archs) ||
1408 c->lock_personality ||
1409 c->protect_hostname;
1410 }
1411
1412 #if HAVE_SECCOMP
1413
1414 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1415
1416 if (is_seccomp_available())
1417 return false;
1418
1419 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1420 return true;
1421 }
1422
1423 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1424 uint32_t negative_action, default_action, action;
1425 int r;
1426
1427 assert(u);
1428 assert(c);
1429
1430 if (!context_has_syscall_filters(c))
1431 return 0;
1432
1433 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1434 return 0;
1435
1436 negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1437
1438 if (c->syscall_whitelist) {
1439 default_action = negative_action;
1440 action = SCMP_ACT_ALLOW;
1441 } else {
1442 default_action = SCMP_ACT_ALLOW;
1443 action = negative_action;
1444 }
1445
1446 if (needs_ambient_hack) {
1447 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1448 if (r < 0)
1449 return r;
1450 }
1451
1452 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1453 }
1454
1455 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1456 assert(u);
1457 assert(c);
1458
1459 if (set_isempty(c->syscall_archs))
1460 return 0;
1461
1462 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1463 return 0;
1464
1465 return seccomp_restrict_archs(c->syscall_archs);
1466 }
1467
1468 static int apply_address_families(const Unit* u, const ExecContext *c) {
1469 assert(u);
1470 assert(c);
1471
1472 if (!context_has_address_families(c))
1473 return 0;
1474
1475 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1476 return 0;
1477
1478 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1479 }
1480
1481 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1482 assert(u);
1483 assert(c);
1484
1485 if (!c->memory_deny_write_execute)
1486 return 0;
1487
1488 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1489 return 0;
1490
1491 return seccomp_memory_deny_write_execute();
1492 }
1493
1494 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1495 assert(u);
1496 assert(c);
1497
1498 if (!c->restrict_realtime)
1499 return 0;
1500
1501 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1502 return 0;
1503
1504 return seccomp_restrict_realtime();
1505 }
1506
1507 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1508 assert(u);
1509 assert(c);
1510
1511 if (!c->restrict_suid_sgid)
1512 return 0;
1513
1514 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1515 return 0;
1516
1517 return seccomp_restrict_suid_sgid();
1518 }
1519
1520 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1521 assert(u);
1522 assert(c);
1523
1524 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1525 * let's protect even those systems where this is left on in the kernel. */
1526
1527 if (!c->protect_kernel_tunables)
1528 return 0;
1529
1530 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1531 return 0;
1532
1533 return seccomp_protect_sysctl();
1534 }
1535
1536 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1537 assert(u);
1538 assert(c);
1539
1540 /* Turn off module syscalls on ProtectKernelModules=yes */
1541
1542 if (!c->protect_kernel_modules)
1543 return 0;
1544
1545 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1546 return 0;
1547
1548 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1549 }
1550
1551 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1552 assert(u);
1553 assert(c);
1554
1555 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1556
1557 if (!c->private_devices)
1558 return 0;
1559
1560 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1561 return 0;
1562
1563 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1564 }
1565
1566 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1567 assert(u);
1568 assert(c);
1569
1570 if (!exec_context_restrict_namespaces_set(c))
1571 return 0;
1572
1573 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1574 return 0;
1575
1576 return seccomp_restrict_namespaces(c->restrict_namespaces);
1577 }
1578
1579 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1580 unsigned long personality;
1581 int r;
1582
1583 assert(u);
1584 assert(c);
1585
1586 if (!c->lock_personality)
1587 return 0;
1588
1589 if (skip_seccomp_unavailable(u, "LockPersonality="))
1590 return 0;
1591
1592 personality = c->personality;
1593
1594 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1595 if (personality == PERSONALITY_INVALID) {
1596
1597 r = opinionated_personality(&personality);
1598 if (r < 0)
1599 return r;
1600 }
1601
1602 return seccomp_lock_personality(personality);
1603 }
1604
1605 #endif
1606
1607 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1608 assert(idle_pipe);
1609
1610 idle_pipe[1] = safe_close(idle_pipe[1]);
1611 idle_pipe[2] = safe_close(idle_pipe[2]);
1612
1613 if (idle_pipe[0] >= 0) {
1614 int r;
1615
1616 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1617
1618 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1619 ssize_t n;
1620
1621 /* Signal systemd that we are bored and want to continue. */
1622 n = write(idle_pipe[3], "x", 1);
1623 if (n > 0)
1624 /* Wait for systemd to react to the signal above. */
1625 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1626 }
1627
1628 idle_pipe[0] = safe_close(idle_pipe[0]);
1629
1630 }
1631
1632 idle_pipe[3] = safe_close(idle_pipe[3]);
1633 }
1634
1635 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1636
1637 static int build_environment(
1638 const Unit *u,
1639 const ExecContext *c,
1640 const ExecParameters *p,
1641 size_t n_fds,
1642 const char *home,
1643 const char *username,
1644 const char *shell,
1645 dev_t journal_stream_dev,
1646 ino_t journal_stream_ino,
1647 char ***ret) {
1648
1649 _cleanup_strv_free_ char **our_env = NULL;
1650 ExecDirectoryType t;
1651 size_t n_env = 0;
1652 char *x;
1653
1654 assert(u);
1655 assert(c);
1656 assert(p);
1657 assert(ret);
1658
1659 our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX);
1660 if (!our_env)
1661 return -ENOMEM;
1662
1663 if (n_fds > 0) {
1664 _cleanup_free_ char *joined = NULL;
1665
1666 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1667 return -ENOMEM;
1668 our_env[n_env++] = x;
1669
1670 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1671 return -ENOMEM;
1672 our_env[n_env++] = x;
1673
1674 joined = strv_join(p->fd_names, ":");
1675 if (!joined)
1676 return -ENOMEM;
1677
1678 x = strjoin("LISTEN_FDNAMES=", joined);
1679 if (!x)
1680 return -ENOMEM;
1681 our_env[n_env++] = x;
1682 }
1683
1684 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1685 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1686 return -ENOMEM;
1687 our_env[n_env++] = x;
1688
1689 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1690 return -ENOMEM;
1691 our_env[n_env++] = x;
1692 }
1693
1694 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1695 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1696 * check the database directly. */
1697 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1698 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1699 if (!x)
1700 return -ENOMEM;
1701 our_env[n_env++] = x;
1702 }
1703
1704 if (home) {
1705 x = strjoin("HOME=", home);
1706 if (!x)
1707 return -ENOMEM;
1708
1709 path_simplify(x + 5, true);
1710 our_env[n_env++] = x;
1711 }
1712
1713 if (username) {
1714 x = strjoin("LOGNAME=", username);
1715 if (!x)
1716 return -ENOMEM;
1717 our_env[n_env++] = x;
1718
1719 x = strjoin("USER=", username);
1720 if (!x)
1721 return -ENOMEM;
1722 our_env[n_env++] = x;
1723 }
1724
1725 if (shell) {
1726 x = strjoin("SHELL=", shell);
1727 if (!x)
1728 return -ENOMEM;
1729
1730 path_simplify(x + 6, true);
1731 our_env[n_env++] = x;
1732 }
1733
1734 if (!sd_id128_is_null(u->invocation_id)) {
1735 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1736 return -ENOMEM;
1737
1738 our_env[n_env++] = x;
1739 }
1740
1741 if (exec_context_needs_term(c)) {
1742 const char *tty_path, *term = NULL;
1743
1744 tty_path = exec_context_tty_path(c);
1745
1746 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1747 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1748 * passes to PID 1 ends up all the way in the console login shown. */
1749
1750 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1751 term = getenv("TERM");
1752 if (!term)
1753 term = default_term_for_tty(tty_path);
1754
1755 x = strjoin("TERM=", term);
1756 if (!x)
1757 return -ENOMEM;
1758 our_env[n_env++] = x;
1759 }
1760
1761 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1762 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1763 return -ENOMEM;
1764
1765 our_env[n_env++] = x;
1766 }
1767
1768 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1769 _cleanup_free_ char *pre = NULL, *joined = NULL;
1770 const char *n;
1771
1772 if (!p->prefix[t])
1773 continue;
1774
1775 if (strv_isempty(c->directories[t].paths))
1776 continue;
1777
1778 n = exec_directory_env_name_to_string(t);
1779 if (!n)
1780 continue;
1781
1782 pre = strjoin(p->prefix[t], "/");
1783 if (!pre)
1784 return -ENOMEM;
1785
1786 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1787 if (!joined)
1788 return -ENOMEM;
1789
1790 x = strjoin(n, "=", joined);
1791 if (!x)
1792 return -ENOMEM;
1793
1794 our_env[n_env++] = x;
1795 }
1796
1797 our_env[n_env++] = NULL;
1798 assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1799
1800 *ret = TAKE_PTR(our_env);
1801
1802 return 0;
1803 }
1804
1805 static int build_pass_environment(const ExecContext *c, char ***ret) {
1806 _cleanup_strv_free_ char **pass_env = NULL;
1807 size_t n_env = 0, n_bufsize = 0;
1808 char **i;
1809
1810 STRV_FOREACH(i, c->pass_environment) {
1811 _cleanup_free_ char *x = NULL;
1812 char *v;
1813
1814 v = getenv(*i);
1815 if (!v)
1816 continue;
1817 x = strjoin(*i, "=", v);
1818 if (!x)
1819 return -ENOMEM;
1820
1821 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1822 return -ENOMEM;
1823
1824 pass_env[n_env++] = TAKE_PTR(x);
1825 pass_env[n_env] = NULL;
1826 }
1827
1828 *ret = TAKE_PTR(pass_env);
1829
1830 return 0;
1831 }
1832
1833 static bool exec_needs_mount_namespace(
1834 const ExecContext *context,
1835 const ExecParameters *params,
1836 const ExecRuntime *runtime) {
1837
1838 assert(context);
1839 assert(params);
1840
1841 if (context->root_image)
1842 return true;
1843
1844 if (!strv_isempty(context->read_write_paths) ||
1845 !strv_isempty(context->read_only_paths) ||
1846 !strv_isempty(context->inaccessible_paths))
1847 return true;
1848
1849 if (context->n_bind_mounts > 0)
1850 return true;
1851
1852 if (context->n_temporary_filesystems > 0)
1853 return true;
1854
1855 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
1856 return true;
1857
1858 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1859 return true;
1860
1861 if (context->private_devices ||
1862 context->private_mounts ||
1863 context->protect_system != PROTECT_SYSTEM_NO ||
1864 context->protect_home != PROTECT_HOME_NO ||
1865 context->protect_kernel_tunables ||
1866 context->protect_kernel_modules ||
1867 context->protect_control_groups)
1868 return true;
1869
1870 if (context->root_directory) {
1871 ExecDirectoryType t;
1872
1873 if (context->mount_apivfs)
1874 return true;
1875
1876 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1877 if (!params->prefix[t])
1878 continue;
1879
1880 if (!strv_isempty(context->directories[t].paths))
1881 return true;
1882 }
1883 }
1884
1885 if (context->dynamic_user &&
1886 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1887 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1888 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1889 return true;
1890
1891 return false;
1892 }
1893
/* Moves the calling process into a new user namespace (unshare(CLONE_NEWUSER)) and has a forked
 * helper child write the caller's "setgroups", "gid_map" and "uid_map" procfs files.
 *
 * uid, gid: if valid and non-zero, they are mapped to themselves in addition to the root mapping.
 *
 * Returns 0 on success. On failure returns a negative errno-style value: -ENOMEM on allocation
 * failure, the error reported by the helper child through the error pipe, or -EIO if the child
 * misbehaved. */
1894 static int setup_private_users(uid_t uid, gid_t gid) {
1895 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1896 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1897 _cleanup_close_ int unshare_ready_fd = -1;
1898 _cleanup_(sigkill_waitp) pid_t pid = 0;
1899 uint64_t c = 1;
1900 ssize_t n;
1901 int r;
1902
1903 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1904 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1905 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1906 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1907 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1908 * continues execution normally. */
1909
1910 if (uid != 0 && uid_is_valid(uid)) {
1911 r = asprintf(&uid_map,
1912 "0 0 1\n" /* Map root → root */
1913 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
1914 uid, uid);
1915 if (r < 0)
1916 return -ENOMEM;
1917 } else {
1918 uid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1919 if (!uid_map)
1920 return -ENOMEM;
1921 }
1922
1923 if (gid != 0 && gid_is_valid(gid)) {
1924 r = asprintf(&gid_map,
1925 "0 0 1\n" /* Map root → root */
1926 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
1927 gid, gid);
1928 if (r < 0)
1929 return -ENOMEM;
1930 } else {
1931 gid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1932 if (!gid_map)
1933 return -ENOMEM;
1934 }
1935
1936 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1937 * namespace. */
1938 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1939 if (unshare_ready_fd < 0)
1940 return -errno;
1941
1942 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1943 * failed. */
1944 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1945 return -errno;
1946
1947 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1948 if (r < 0)
1949 return r;
1950 if (r == 0) {
1951 _cleanup_close_ int fd = -1;
1952 const char *a;
1953 pid_t ppid;
1954
1955 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1956 * here, after the parent opened its own user namespace. */
1957
1958 ppid = getppid();
1959 errno_pipe[0] = safe_close(errno_pipe[0]);
1960
1961 /* Wait until the parent unshared the user namespace */
1962 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1963 r = -errno;
1964 goto child_fail;
1965 }
1966
1967 /* Disable the setgroups() system call in the child user namespace, for good. */
1968 a = procfs_file_alloca(ppid, "setgroups");
1969 fd = open(a, O_WRONLY|O_CLOEXEC);
1970 if (fd < 0) {
1971 if (errno != ENOENT) {
1972 r = -errno;
1973 goto child_fail;
1974 }
1975
1976 /* If the file is missing the kernel is too old, let's continue anyway. */
1977 } else {
1978 if (write(fd, "deny\n", 5) < 0) {
1979 r = -errno;
1980 goto child_fail;
1981 }
1982
1983 fd = safe_close(fd);
1984 }
1985
1986 /* First write the GID map */
1987 a = procfs_file_alloca(ppid, "gid_map");
1988 fd = open(a, O_WRONLY|O_CLOEXEC);
1989 if (fd < 0) {
1990 r = -errno;
1991 goto child_fail;
1992 }
1993 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1994 r = -errno;
1995 goto child_fail;
1996 }
1997 fd = safe_close(fd);
1998
1999 /* Then write the UID map */
2000 a = procfs_file_alloca(ppid, "uid_map");
2001 fd = open(a, O_WRONLY|O_CLOEXEC);
2002 if (fd < 0) {
2003 r = -errno;
2004 goto child_fail;
2005 }
2006 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2007 r = -errno;
2008 goto child_fail;
2009 }
2010
2011 _exit(EXIT_SUCCESS);
2012
2013 child_fail:
/* Report the (negative) error code to the parent; best effort, a failed write is ignored */
2014 (void) write(errno_pipe[1], &r, sizeof(r));
2015 _exit(EXIT_FAILURE);
2016 }
2017
/* Parent: drop our copy of the write end, so that only the child holds it */
2018 errno_pipe[1] = safe_close(errno_pipe[1]);
2019
2020 if (unshare(CLONE_NEWUSER) < 0)
2021 return -errno;
2022
2023 /* Let the child know that the namespace is ready now */
2024 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2025 return -errno;
2026
2027 /* Try to read an error code from the child */
2028 n = read(errno_pipe[0], &r, sizeof(r));
2029 if (n < 0)
2030 return -errno;
2031 if (n == sizeof(r)) { /* an error code was sent to us */
2032 if (r < 0)
2033 return r;
/* The child sent a non-negative "error" code, which it never should: treat as I/O error */
2034 return -EIO;
2035 }
2036 if (n != 0) /* on success we should have read 0 bytes */
2037 return -EIO;
2038
2039 r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
/* Reset pid after waiting; presumably this keeps the sigkill_waitp cleanup handler from acting on
 * the child again — confirm against its definition */
2040 pid = 0;
2041 if (r < 0)
2042 return r;
2043 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2044 return -EIO;
2045
2046 return 0;
2047 }
2048
2049 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2050 if (!context->dynamic_user)
2051 return false;
2052
2053 if (type == EXEC_DIRECTORY_CONFIGURATION)
2054 return false;
2055
2056 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2057 return false;
2058
2059 return true;
2060 }
2061
/* Creates and prepares all directories of the given type that are configured in the context, below
 * params->prefix[type]. Does nothing if no prefix is set for the type.
 *
 * For each configured path this creates the parent directories, creates the directory itself
 * (inside a root-owned "private" directory plus a symlink if exec_directory_is_private() says so),
 * migrates a pre-existing directory between the private and the public location if necessary, and
 * finally adjusts access mode and ownership. An already existing configuration directory is left
 * untouched apart from a warning if its mode differs.
 *
 * uid, gid: the ownership to apply; if EXEC_CHOWN_DIRECTORIES is set, invalid values are replaced by 0.
 *
 * Returns 0 on success. On failure returns a negative errno-style value and stores the exit status
 * matching the directory type in *exit_status. */
2062 static int setup_exec_directory(
2063 const ExecContext *context,
2064 const ExecParameters *params,
2065 uid_t uid,
2066 gid_t gid,
2067 ExecDirectoryType type,
2068 int *exit_status) {
2069
/* Maps each directory type to the exit status reported when setting it up fails */
2070 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2071 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2072 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2073 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2074 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2075 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2076 };
2077 char **rt;
2078 int r;
2079
2080 assert(context);
2081 assert(params);
2082 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2083 assert(exit_status);
2084
2085 if (!params->prefix[type])
2086 return 0;
2087
2088 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2089 if (!uid_is_valid(uid))
2090 uid = 0;
2091 if (!gid_is_valid(gid))
2092 gid = 0;
2093 }
2094
2095 STRV_FOREACH(rt, context->directories[type].paths) {
/* p is the public location of the directory, pp the one below "private" (only set in the private case) */
2096 _cleanup_free_ char *p = NULL, *pp = NULL;
2097
2098 p = path_join(params->prefix[type], *rt);
2099 if (!p) {
2100 r = -ENOMEM;
2101 goto fail;
2102 }
2103
2104 r = mkdir_parents_label(p, 0755);
2105 if (r < 0)
2106 goto fail;
2107
2108 if (exec_directory_is_private(context, type)) {
2109 _cleanup_free_ char *private_root = NULL;
2110
2111 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2112 * case we want to avoid leaving a directory around fully accessible that is owned by
2113 * a dynamic user whose UID is later on reused. To lock this down we use the same
2114 * trick used by container managers to prohibit host users to get access to files of
2115 * the same UID in containers: we place everything inside a directory that has an
2116 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2117 * for unprivileged host code. We then use fs namespacing to make this directory
2118 * permeable for the service itself.
2119 *
2120 * Specifically: for a service which wants a special directory "foo/" we first create
2121 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2122 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2123 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2124 * unprivileged host users can't look into it. Inside of the namespace of the unit
2125 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2126 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2127 * for the service and making sure it only gets access to the dirs it needs but no
2128 * others. Tricky? Yes, absolutely, but it works!
2129 *
2130 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2131 * to be owned by the service itself.
2132 *
2133 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2134 * for sharing files or sockets with other services. */
2135
2136 private_root = path_join(params->prefix[type], "private");
2137 if (!private_root) {
2138 r = -ENOMEM;
2139 goto fail;
2140 }
2141
2142 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2143 r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2144 if (r < 0)
2145 goto fail;
2146
2147 pp = path_join(private_root, *rt);
2148 if (!pp) {
2149 r = -ENOMEM;
2150 goto fail;
2151 }
2152
2153 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2154 r = mkdir_parents_label(pp, 0755);
2155 if (r < 0)
2156 goto fail;
2157
2158 if (is_dir(p, false) > 0 &&
2159 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2160
2161 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2162 * it over. Most likely the service has been upgraded from one that didn't use
2163 * DynamicUser=1, to one that does. */
2164
2165 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2166 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2167 exec_directory_type_to_string(type), p, pp);
2168
2169 if (rename(p, pp) < 0) {
2170 r = -errno;
2171 goto fail;
2172 }
2173 } else {
2174 /* Otherwise, create the actual directory for the service */
2175
2176 r = mkdir_label(pp, context->directories[type].mode);
2177 if (r < 0 && r != -EEXIST)
2178 goto fail;
2179 }
2180
2181 /* And link it up from the original place */
2182 r = symlink_idempotent(pp, p, true);
2183 if (r < 0)
2184 goto fail;
2185
2186 } else {
2187 _cleanup_free_ char *target = NULL;
2188
2189 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2190 readlink_and_make_absolute(p, &target) >= 0) {
2191 _cleanup_free_ char *q = NULL;
2192
2193 /* This already exists and is a symlink? Interesting. Maybe it's one created
2194 * by DynamicUser=1 (see above)?
2195 *
2196 * We do this for all directory types except for ConfigurationDirectory=,
2197 * since they all support the private/ symlink logic at least in some
2198 * configurations, see above. */
2199
2200 q = path_join(params->prefix[type], "private", *rt);
2201 if (!q) {
2202 r = -ENOMEM;
2203 goto fail;
2204 }
2205
2206 if (path_equal(q, target)) {
2207
2208 /* Hmm, apparently DynamicUser= was once turned on for this service,
2209 * but is no longer. Let's move the directory back up. */
2210
2211 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2212 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2213 exec_directory_type_to_string(type), q, p);
2214
/* Remove the symlink first, so that the directory can take over its name */
2215 if (unlink(p) < 0) {
2216 r = -errno;
2217 goto fail;
2218 }
2219
2220 if (rename(q, p) < 0) {
2221 r = -errno;
2222 goto fail;
2223 }
2224 }
2225 }
2226
2227 r = mkdir_label(p, context->directories[type].mode);
2228 if (r < 0) {
2229 if (r != -EEXIST)
2230 goto fail;
2231
2232 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2233 struct stat st;
2234
2235 /* Don't change the owner/access mode of the configuration directory,
2236 * as in the common case it is not written to by a service, and shall
2237 * not be writable. */
2238
2239 if (stat(p, &st) < 0) {
2240 r = -errno;
2241 goto fail;
2242 }
2243
2244 /* Still complain if the access mode doesn't match */
2245 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2246 log_warning("%s \'%s\' already exists but the mode is different. "
2247 "(File system: %o %sMode: %o)",
2248 exec_directory_type_to_string(type), *rt,
2249 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2250
/* Skip the chmod/chown steps below for this pre-existing configuration directory */
2251 continue;
2252 }
2253 }
2254 }
2255
2256 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2257 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2258 * current UID/GID ownership.) */
/* pp is only set in the private case, where p is the symlink; otherwise operate on p itself */
2259 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2260 if (r < 0)
2261 goto fail;
2262
2263 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2264 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2265 * assignments to exist.*/
2266 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2267 if (r < 0)
2268 goto fail;
2269 }
2270
2271 return 0;
2272
2273 fail:
2274 *exit_status = exit_status_table[type];
2275 return r;
2276 }
2277
2278 #if ENABLE_SMACK
2279 static int setup_smack(
2280 const ExecContext *context,
2281 const ExecCommand *command) {
2282
2283 int r;
2284
2285 assert(context);
2286 assert(command);
2287
2288 if (context->smack_process_label) {
2289 r = mac_smack_apply_pid(0, context->smack_process_label);
2290 if (r < 0)
2291 return r;
2292 }
2293 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2294 else {
2295 _cleanup_free_ char *exec_label = NULL;
2296
2297 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2298 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2299 return r;
2300
2301 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2302 if (r < 0)
2303 return r;
2304 }
2305 #endif
2306
2307 return 0;
2308 }
2309 #endif
2310
2311 static int compile_bind_mounts(
2312 const ExecContext *context,
2313 const ExecParameters *params,
2314 BindMount **ret_bind_mounts,
2315 size_t *ret_n_bind_mounts,
2316 char ***ret_empty_directories) {
2317
2318 _cleanup_strv_free_ char **empty_directories = NULL;
2319 BindMount *bind_mounts;
2320 size_t n, h = 0, i;
2321 ExecDirectoryType t;
2322 int r;
2323
2324 assert(context);
2325 assert(params);
2326 assert(ret_bind_mounts);
2327 assert(ret_n_bind_mounts);
2328 assert(ret_empty_directories);
2329
2330 n = context->n_bind_mounts;
2331 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2332 if (!params->prefix[t])
2333 continue;
2334
2335 n += strv_length(context->directories[t].paths);
2336 }
2337
2338 if (n <= 0) {
2339 *ret_bind_mounts = NULL;
2340 *ret_n_bind_mounts = 0;
2341 *ret_empty_directories = NULL;
2342 return 0;
2343 }
2344
2345 bind_mounts = new(BindMount, n);
2346 if (!bind_mounts)
2347 return -ENOMEM;
2348
2349 for (i = 0; i < context->n_bind_mounts; i++) {
2350 BindMount *item = context->bind_mounts + i;
2351 char *s, *d;
2352
2353 s = strdup(item->source);
2354 if (!s) {
2355 r = -ENOMEM;
2356 goto finish;
2357 }
2358
2359 d = strdup(item->destination);
2360 if (!d) {
2361 free(s);
2362 r = -ENOMEM;
2363 goto finish;
2364 }
2365
2366 bind_mounts[h++] = (BindMount) {
2367 .source = s,
2368 .destination = d,
2369 .read_only = item->read_only,
2370 .recursive = item->recursive,
2371 .ignore_enoent = item->ignore_enoent,
2372 };
2373 }
2374
2375 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2376 char **suffix;
2377
2378 if (!params->prefix[t])
2379 continue;
2380
2381 if (strv_isempty(context->directories[t].paths))
2382 continue;
2383
2384 if (exec_directory_is_private(context, t) &&
2385 !(context->root_directory || context->root_image)) {
2386 char *private_root;
2387
2388 /* So this is for a dynamic user, and we need to make sure the process can access its own
2389 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2390 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2391
2392 private_root = path_join(params->prefix[t], "private");
2393 if (!private_root) {
2394 r = -ENOMEM;
2395 goto finish;
2396 }
2397
2398 r = strv_consume(&empty_directories, private_root);
2399 if (r < 0)
2400 goto finish;
2401 }
2402
2403 STRV_FOREACH(suffix, context->directories[t].paths) {
2404 char *s, *d;
2405
2406 if (exec_directory_is_private(context, t))
2407 s = path_join(params->prefix[t], "private", *suffix);
2408 else
2409 s = path_join(params->prefix[t], *suffix);
2410 if (!s) {
2411 r = -ENOMEM;
2412 goto finish;
2413 }
2414
2415 if (exec_directory_is_private(context, t) &&
2416 (context->root_directory || context->root_image))
2417 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2418 * directory is not created on the root directory. So, let's bind-mount the directory
2419 * on the 'non-private' place. */
2420 d = path_join(params->prefix[t], *suffix);
2421 else
2422 d = strdup(s);
2423 if (!d) {
2424 free(s);
2425 r = -ENOMEM;
2426 goto finish;
2427 }
2428
2429 bind_mounts[h++] = (BindMount) {
2430 .source = s,
2431 .destination = d,
2432 .read_only = false,
2433 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2434 .recursive = true,
2435 .ignore_enoent = false,
2436 };
2437 }
2438 }
2439
2440 assert(h == n);
2441
2442 *ret_bind_mounts = bind_mounts;
2443 *ret_n_bind_mounts = n;
2444 *ret_empty_directories = TAKE_PTR(empty_directories);
2445
2446 return (int) n;
2447
2448 finish:
2449 bind_mount_free_many(bind_mounts, h);
2450 return r;
2451 }
2452
2453 static int apply_mount_namespace(
2454 const Unit *u,
2455 const ExecCommand *command,
2456 const ExecContext *context,
2457 const ExecParameters *params,
2458 const ExecRuntime *runtime,
2459 char **error_path) {
2460
2461 _cleanup_strv_free_ char **empty_directories = NULL;
2462 char *tmp = NULL, *var = NULL;
2463 const char *root_dir = NULL, *root_image = NULL;
2464 NamespaceInfo ns_info;
2465 bool needs_sandboxing;
2466 BindMount *bind_mounts = NULL;
2467 size_t n_bind_mounts = 0;
2468 int r;
2469
2470 assert(context);
2471
2472 /* The runtime struct only contains the parent of the private /tmp,
2473 * which is non-accessible to world users. Inside of it there's a /tmp
2474 * that is sticky, and that's the one we want to use here. */
2475
2476 if (context->private_tmp && runtime) {
2477 if (runtime->tmp_dir)
2478 tmp = strjoina(runtime->tmp_dir, "/tmp");
2479 if (runtime->var_tmp_dir)
2480 var = strjoina(runtime->var_tmp_dir, "/tmp");
2481 }
2482
2483 if (params->flags & EXEC_APPLY_CHROOT) {
2484 root_image = context->root_image;
2485
2486 if (!root_image)
2487 root_dir = context->root_directory;
2488 }
2489
2490 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2491 if (r < 0)
2492 return r;
2493
2494 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2495 if (needs_sandboxing)
2496 ns_info = (NamespaceInfo) {
2497 .ignore_protect_paths = false,
2498 .private_dev = context->private_devices,
2499 .protect_control_groups = context->protect_control_groups,
2500 .protect_kernel_tunables = context->protect_kernel_tunables,
2501 .protect_kernel_modules = context->protect_kernel_modules,
2502 .protect_hostname = context->protect_hostname,
2503 .mount_apivfs = context->mount_apivfs,
2504 .private_mounts = context->private_mounts,
2505 };
2506 else if (!context->dynamic_user && root_dir)
2507 /*
2508 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2509 * sandbox info, otherwise enforce it, don't ignore protected paths and
2510 * fail if we are enable to apply the sandbox inside the mount namespace.
2511 */
2512 ns_info = (NamespaceInfo) {
2513 .ignore_protect_paths = true,
2514 };
2515 else
2516 ns_info = (NamespaceInfo) {};
2517
2518 if (context->mount_flags == MS_SHARED)
2519 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2520
2521 r = setup_namespace(root_dir, root_image,
2522 &ns_info, context->read_write_paths,
2523 needs_sandboxing ? context->read_only_paths : NULL,
2524 needs_sandboxing ? context->inaccessible_paths : NULL,
2525 empty_directories,
2526 bind_mounts,
2527 n_bind_mounts,
2528 context->temporary_filesystems,
2529 context->n_temporary_filesystems,
2530 tmp,
2531 var,
2532 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2533 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2534 context->mount_flags,
2535 DISSECT_IMAGE_DISCARD_ON_LOOP,
2536 error_path);
2537
2538 bind_mount_free_many(bind_mounts, n_bind_mounts);
2539
2540 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2541 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2542 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2543 * completely different execution environment. */
2544 if (r == -ENOANO) {
2545 if (n_bind_mounts == 0 &&
2546 context->n_temporary_filesystems == 0 &&
2547 !root_dir && !root_image &&
2548 !context->dynamic_user) {
2549 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2550 return 0;
2551 }
2552
2553 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2554 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2555 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2556
2557 return -EOPNOTSUPP;
2558 }
2559
2560 return r;
2561 }
2562
2563 static int apply_working_directory(
2564 const ExecContext *context,
2565 const ExecParameters *params,
2566 const char *home,
2567 int *exit_status) {
2568
2569 const char *d, *wd;
2570
2571 assert(context);
2572 assert(exit_status);
2573
2574 if (context->working_directory_home) {
2575
2576 if (!home) {
2577 *exit_status = EXIT_CHDIR;
2578 return -ENXIO;
2579 }
2580
2581 wd = home;
2582
2583 } else if (context->working_directory)
2584 wd = context->working_directory;
2585 else
2586 wd = "/";
2587
2588 if (params->flags & EXEC_APPLY_CHROOT)
2589 d = wd;
2590 else
2591 d = prefix_roota(context->root_directory, wd);
2592
2593 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2594 *exit_status = EXIT_CHDIR;
2595 return -errno;
2596 }
2597
2598 return 0;
2599 }
2600
2601 static int apply_root_directory(
2602 const ExecContext *context,
2603 const ExecParameters *params,
2604 const bool needs_mount_ns,
2605 int *exit_status) {
2606
2607 assert(context);
2608 assert(exit_status);
2609
2610 if (params->flags & EXEC_APPLY_CHROOT) {
2611 if (!needs_mount_ns && context->root_directory)
2612 if (chroot(context->root_directory) < 0) {
2613 *exit_status = EXIT_CHROOT;
2614 return -errno;
2615 }
2616 }
2617
2618 return 0;
2619 }
2620
2621 static int setup_keyring(
2622 const Unit *u,
2623 const ExecContext *context,
2624 const ExecParameters *p,
2625 uid_t uid, gid_t gid) {
2626
2627 key_serial_t keyring;
2628 int r = 0;
2629 uid_t saved_uid;
2630 gid_t saved_gid;
2631
2632 assert(u);
2633 assert(context);
2634 assert(p);
2635
2636 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2637 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2638 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2639 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2640 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2641 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2642
2643 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2644 return 0;
2645
2646 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2647 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2648 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2649 * & group is just as nasty as acquiring a reference to the user keyring. */
2650
2651 saved_uid = getuid();
2652 saved_gid = getgid();
2653
2654 if (gid_is_valid(gid) && gid != saved_gid) {
2655 if (setregid(gid, -1) < 0)
2656 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2657 }
2658
2659 if (uid_is_valid(uid) && uid != saved_uid) {
2660 if (setreuid(uid, -1) < 0) {
2661 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2662 goto out;
2663 }
2664 }
2665
2666 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2667 if (keyring == -1) {
2668 if (errno == ENOSYS)
2669 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2670 else if (IN_SET(errno, EACCES, EPERM))
2671 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2672 else if (errno == EDQUOT)
2673 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2674 else
2675 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2676
2677 goto out;
2678 }
2679
2680 /* When requested link the user keyring into the session keyring. */
2681 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2682
2683 if (keyctl(KEYCTL_LINK,
2684 KEY_SPEC_USER_KEYRING,
2685 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2686 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2687 goto out;
2688 }
2689 }
2690
2691 /* Restore uid/gid back */
2692 if (uid_is_valid(uid) && uid != saved_uid) {
2693 if (setreuid(saved_uid, -1) < 0) {
2694 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2695 goto out;
2696 }
2697 }
2698
2699 if (gid_is_valid(gid) && gid != saved_gid) {
2700 if (setregid(saved_gid, -1) < 0)
2701 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2702 }
2703
2704 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2705 if (!sd_id128_is_null(u->invocation_id)) {
2706 key_serial_t key;
2707
2708 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2709 if (key == -1)
2710 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2711 else {
2712 if (keyctl(KEYCTL_SETPERM, key,
2713 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2714 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2715 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2716 }
2717 }
2718
2719 out:
2720 /* Revert back uid & gid for the the last time, and exit */
2721 /* no extra logging, as only the first already reported error matters */
2722 if (getuid() != saved_uid)
2723 (void) setreuid(saved_uid, -1);
2724
2725 if (getgid() != saved_gid)
2726 (void) setregid(saved_gid, -1);
2727
2728 return r;
2729 }
2730
/* Appends the valid (i.e. non-negative) file descriptors of 'pair' to 'array', first pair[0], then pair[1].
 * '*n' is the current fill level of 'array' and is advanced by one for each fd added; the caller must make
 * sure the array has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        size_t k;

        assert(array);
        assert(n);
        assert(pair);

        for (k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2741
2742 static int close_remaining_fds(
2743 const ExecParameters *params,
2744 const ExecRuntime *runtime,
2745 const DynamicCreds *dcreds,
2746 int user_lookup_fd,
2747 int socket_fd,
2748 int exec_fd,
2749 int *fds, size_t n_fds) {
2750
2751 size_t n_dont_close = 0;
2752 int dont_close[n_fds + 12];
2753
2754 assert(params);
2755
2756 if (params->stdin_fd >= 0)
2757 dont_close[n_dont_close++] = params->stdin_fd;
2758 if (params->stdout_fd >= 0)
2759 dont_close[n_dont_close++] = params->stdout_fd;
2760 if (params->stderr_fd >= 0)
2761 dont_close[n_dont_close++] = params->stderr_fd;
2762
2763 if (socket_fd >= 0)
2764 dont_close[n_dont_close++] = socket_fd;
2765 if (exec_fd >= 0)
2766 dont_close[n_dont_close++] = exec_fd;
2767 if (n_fds > 0) {
2768 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2769 n_dont_close += n_fds;
2770 }
2771
2772 if (runtime)
2773 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2774
2775 if (dcreds) {
2776 if (dcreds->user)
2777 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2778 if (dcreds->group)
2779 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2780 }
2781
2782 if (user_lookup_fd >= 0)
2783 dont_close[n_dont_close++] = user_lookup_fd;
2784
2785 return close_all_fds(dont_close, n_dont_close);
2786 }
2787
2788 static int send_user_lookup(
2789 Unit *unit,
2790 int user_lookup_fd,
2791 uid_t uid,
2792 gid_t gid) {
2793
2794 assert(unit);
2795
2796 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2797 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2798 * specified. */
2799
2800 if (user_lookup_fd < 0)
2801 return 0;
2802
2803 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2804 return 0;
2805
2806 if (writev(user_lookup_fd,
2807 (struct iovec[]) {
2808 IOVEC_INIT(&uid, sizeof(uid)),
2809 IOVEC_INIT(&gid, sizeof(gid)),
2810 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2811 return -errno;
2812
2813 return 0;
2814 }
2815
2816 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2817 int r;
2818
2819 assert(c);
2820 assert(home);
2821 assert(buf);
2822
2823 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2824
2825 if (*home)
2826 return 0;
2827
2828 if (!c->working_directory_home)
2829 return 0;
2830
2831 r = get_home_dir(buf);
2832 if (r < 0)
2833 return r;
2834
2835 *home = *buf;
2836 return 1;
2837 }
2838
2839 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2840 _cleanup_strv_free_ char ** list = NULL;
2841 ExecDirectoryType t;
2842 int r;
2843
2844 assert(c);
2845 assert(p);
2846 assert(ret);
2847
2848 assert(c->dynamic_user);
2849
2850 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2851 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2852 * directories. */
2853
2854 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2855 char **i;
2856
2857 if (t == EXEC_DIRECTORY_CONFIGURATION)
2858 continue;
2859
2860 if (!p->prefix[t])
2861 continue;
2862
2863 STRV_FOREACH(i, c->directories[t].paths) {
2864 char *e;
2865
2866 if (exec_directory_is_private(c, t))
2867 e = path_join(p->prefix[t], "private", *i);
2868 else
2869 e = path_join(p->prefix[t], *i);
2870 if (!e)
2871 return -ENOMEM;
2872
2873 r = strv_consume(&list, e);
2874 if (r < 0)
2875 return r;
2876 }
2877 }
2878
2879 *ret = TAKE_PTR(list);
2880
2881 return 0;
2882 }
2883
/* Forward declaration; the definition is not part of this section of the file. exec_child() below calls
 * this with the command's argv to obtain the command line string passed to the confirmation prompt,
 * treats a NULL return as out-of-memory, and frees the returned string when done. */
2884 static char *exec_command_line(char **argv);
2885
2886 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
2887 bool using_subcgroup;
2888 char *p;
2889
2890 assert(params);
2891 assert(ret);
2892
2893 if (!params->cgroup_path)
2894 return -EINVAL;
2895
2896 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
2897 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
2898 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
2899 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
2900 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
2901 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
2902 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
2903 * flag, which is only passed for the former statements, not for the latter. */
2904
2905 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
2906 if (using_subcgroup)
2907 p = path_join(params->cgroup_path, ".control");
2908 else
2909 p = strdup(params->cgroup_path);
2910 if (!p)
2911 return -ENOMEM;
2912
2913 *ret = p;
2914 return using_subcgroup;
2915 }
2916
2917 static int exec_child(
2918 Unit *unit,
2919 const ExecCommand *command,
2920 const ExecContext *context,
2921 const ExecParameters *params,
2922 ExecRuntime *runtime,
2923 DynamicCreds *dcreds,
2924 int socket_fd,
2925 const int named_iofds[static 3],
2926 int *fds,
2927 size_t n_socket_fds,
2928 size_t n_storage_fds,
2929 char **files_env,
2930 int user_lookup_fd,
2931 int *exit_status) {
2932
2933 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
2934 int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
2935 _cleanup_free_ gid_t *supplementary_gids = NULL;
2936 const char *username = NULL, *groupname = NULL;
2937 _cleanup_free_ char *home_buffer = NULL;
2938 const char *home = NULL, *shell = NULL;
2939 char **final_argv = NULL;
2940 dev_t journal_stream_dev = 0;
2941 ino_t journal_stream_ino = 0;
2942 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2943 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
2944 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
2945 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
2946 #if HAVE_SELINUX
2947 _cleanup_free_ char *mac_selinux_context_net = NULL;
2948 bool use_selinux = false;
2949 #endif
2950 #if ENABLE_SMACK
2951 bool use_smack = false;
2952 #endif
2953 #if HAVE_APPARMOR
2954 bool use_apparmor = false;
2955 #endif
2956 uid_t uid = UID_INVALID;
2957 gid_t gid = GID_INVALID;
2958 size_t n_fds;
2959 ExecDirectoryType dt;
2960 int secure_bits;
2961
2962 assert(unit);
2963 assert(command);
2964 assert(context);
2965 assert(params);
2966 assert(exit_status);
2967
2968 rename_process_from_path(command->path);
2969
2970 /* We reset exactly these signals, since they are the
2971 * only ones we set to SIG_IGN in the main daemon. All
2972 * others we leave untouched because we set them to
2973 * SIG_DFL or a valid handler initially, both of which
2974 * will be demoted to SIG_DFL. */
2975 (void) default_signals(SIGNALS_CRASH_HANDLER,
2976 SIGNALS_IGNORE, -1);
2977
2978 if (context->ignore_sigpipe)
2979 (void) ignore_signals(SIGPIPE, -1);
2980
2981 r = reset_signal_mask();
2982 if (r < 0) {
2983 *exit_status = EXIT_SIGNAL_MASK;
2984 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2985 }
2986
2987 if (params->idle_pipe)
2988 do_idle_pipe_dance(params->idle_pipe);
2989
2990 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2991 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2992 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2993 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2994
2995 log_forget_fds();
2996 log_set_open_when_needed(true);
2997
2998 /* In case anything used libc syslog(), close this here, too */
2999 closelog();
3000
3001 n_fds = n_socket_fds + n_storage_fds;
3002 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
3003 if (r < 0) {
3004 *exit_status = EXIT_FDS;
3005 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3006 }
3007
3008 if (!context->same_pgrp)
3009 if (setsid() < 0) {
3010 *exit_status = EXIT_SETSID;
3011 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3012 }
3013
3014 exec_context_tty_reset(context, params);
3015
3016 if (unit_shall_confirm_spawn(unit)) {
3017 const char *vc = params->confirm_spawn;
3018 _cleanup_free_ char *cmdline = NULL;
3019
3020 cmdline = exec_command_line(command->argv);
3021 if (!cmdline) {
3022 *exit_status = EXIT_MEMORY;
3023 return log_oom();
3024 }
3025
3026 r = ask_for_confirmation(vc, unit, cmdline);
3027 if (r != CONFIRM_EXECUTE) {
3028 if (r == CONFIRM_PRETEND_SUCCESS) {
3029 *exit_status = EXIT_SUCCESS;
3030 return 0;
3031 }
3032 *exit_status = EXIT_CONFIRM;
3033 log_unit_error(unit, "Execution cancelled by the user");
3034 return -ECANCELED;
3035 }
3036 }
3037
3038 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3039 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3040 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3041 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3042 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3043 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3044 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3045 *exit_status = EXIT_MEMORY;
3046 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3047 }
3048
3049 if (context->dynamic_user && dcreds) {
3050 _cleanup_strv_free_ char **suggested_paths = NULL;
3051
3052 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3053 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3054 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3055 *exit_status = EXIT_USER;
3056 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3057 }
3058
3059 r = compile_suggested_paths(context, params, &suggested_paths);
3060 if (r < 0) {
3061 *exit_status = EXIT_MEMORY;
3062 return log_oom();
3063 }
3064
3065 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3066 if (r < 0) {
3067 *exit_status = EXIT_USER;
3068 if (r == -EILSEQ) {
3069 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3070 return -EOPNOTSUPP;
3071 }
3072 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3073 }
3074
3075 if (!uid_is_valid(uid)) {
3076 *exit_status = EXIT_USER;
3077 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
3078 return -ESRCH;
3079 }
3080
3081 if (!gid_is_valid(gid)) {
3082 *exit_status = EXIT_USER;
3083 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
3084 return -ESRCH;
3085 }
3086
3087 if (dcreds->user)
3088 username = dcreds->user->name;
3089
3090 } else {
3091 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3092 if (r < 0) {
3093 *exit_status = EXIT_USER;
3094 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3095 }
3096
3097 r = get_fixed_group(context, &groupname, &gid);
3098 if (r < 0) {
3099 *exit_status = EXIT_GROUP;
3100 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3101 }
3102 }
3103
3104 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3105 r = get_supplementary_groups(context, username, groupname, gid,
3106 &supplementary_gids, &ngids);
3107 if (r < 0) {
3108 *exit_status = EXIT_GROUP;
3109 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3110 }
3111
3112 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3113 if (r < 0) {
3114 *exit_status = EXIT_USER;
3115 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3116 }
3117
3118 user_lookup_fd = safe_close(user_lookup_fd);
3119
3120 r = acquire_home(context, uid, &home, &home_buffer);
3121 if (r < 0) {
3122 *exit_status = EXIT_CHDIR;
3123 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3124 }
3125
3126 /* If a socket is connected to STDIN/STDOUT/STDERR, we
3127 * must be sure to drop O_NONBLOCK */
3128 if (socket_fd >= 0)
3129 (void) fd_nonblock(socket_fd, false);
3130
3131 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3132 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3133 if (params->cgroup_path) {
3134 _cleanup_free_ char *p = NULL;
3135
3136 r = exec_parameters_get_cgroup_path(params, &p);
3137 if (r < 0) {
3138 *exit_status = EXIT_CGROUP;
3139 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3140 }
3141
3142 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3143 if (r < 0) {
3144 *exit_status = EXIT_CGROUP;
3145 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3146 }
3147 }
3148
3149 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3150 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3151 if (r < 0) {
3152 *exit_status = EXIT_NETWORK;
3153 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3154 }
3155 }
3156
3157 r = setup_input(context, params, socket_fd, named_iofds);
3158 if (r < 0) {
3159 *exit_status = EXIT_STDIN;
3160 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3161 }
3162
3163 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3164 if (r < 0) {
3165 *exit_status = EXIT_STDOUT;
3166 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3167 }
3168
3169 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3170 if (r < 0) {
3171 *exit_status = EXIT_STDERR;
3172 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3173 }
3174
3175 if (context->oom_score_adjust_set) {
3176 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3177 * prohibit write access to this file, and we shouldn't trip up over that. */
3178 r = set_oom_score_adjust(context->oom_score_adjust);
3179 if (IN_SET(r, -EPERM, -EACCES))
3180 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3181 else if (r < 0) {
3182 *exit_status = EXIT_OOM_ADJUST;
3183 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3184 }
3185 }
3186
3187 if (context->nice_set)
3188 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
3189 *exit_status = EXIT_NICE;
3190 return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
3191 }
3192
3193 if (context->cpu_sched_set) {
3194 struct sched_param param = {
3195 .sched_priority = context->cpu_sched_priority,
3196 };
3197
3198 r = sched_setscheduler(0,
3199 context->cpu_sched_policy |
3200 (context->cpu_sched_reset_on_fork ?
3201 SCHED_RESET_ON_FORK : 0),
3202 &param);
3203 if (r < 0) {
3204 *exit_status = EXIT_SETSCHEDULER;
3205 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3206 }
3207 }
3208
3209 if (context->cpu_set.set)
3210 if (sched_setaffinity(0, context->cpu_set.allocated, context->cpu_set.set) < 0) {
3211 *exit_status = EXIT_CPUAFFINITY;
3212 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3213 }
3214
3215 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3216 r = apply_numa_policy(&context->numa_policy);
3217 if (r == -EOPNOTSUPP)
3218 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
3219 else if (r < 0) {
3220 *exit_status = EXIT_NUMA_POLICY;
3221 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
3222 }
3223 }
3224
3225 if (context->ioprio_set)
3226 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3227 *exit_status = EXIT_IOPRIO;
3228 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3229 }
3230
3231 if (context->timer_slack_nsec != NSEC_INFINITY)
3232 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3233 *exit_status = EXIT_TIMERSLACK;
3234 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3235 }
3236
3237 if (context->personality != PERSONALITY_INVALID) {
3238 r = safe_personality(context->personality);
3239 if (r < 0) {
3240 *exit_status = EXIT_PERSONALITY;
3241 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3242 }
3243 }
3244
3245 if (context->utmp_id)
3246 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3247 context->tty_path,
3248 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
3249 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3250 USER_PROCESS,
3251 username);
3252
3253 if (uid_is_valid(uid)) {
3254 r = chown_terminal(STDIN_FILENO, uid);
3255 if (r < 0) {
3256 *exit_status = EXIT_STDIN;
3257 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3258 }
3259 }
3260
3261 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3262 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3263 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3264 * touch a single hierarchy too. */
3265 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3266 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3267 if (r < 0) {
3268 *exit_status = EXIT_CGROUP;
3269 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3270 }
3271 }
3272
3273 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3274 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3275 if (r < 0)
3276 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3277 }
3278
3279 r = build_environment(
3280 unit,
3281 context,
3282 params,
3283 n_fds,
3284 home,
3285 username,
3286 shell,
3287 journal_stream_dev,
3288 journal_stream_ino,
3289 &our_env);
3290 if (r < 0) {
3291 *exit_status = EXIT_MEMORY;
3292 return log_oom();
3293 }
3294
3295 r = build_pass_environment(context, &pass_env);
3296 if (r < 0) {
3297 *exit_status = EXIT_MEMORY;
3298 return log_oom();
3299 }
3300
3301 accum_env = strv_env_merge(5,
3302 params->environment,
3303 our_env,
3304 pass_env,
3305 context->environment,
3306 files_env,
3307 NULL);
3308 if (!accum_env) {
3309 *exit_status = EXIT_MEMORY;
3310 return log_oom();
3311 }
3312 accum_env = strv_env_clean(accum_env);
3313
3314 (void) umask(context->umask);
3315
3316 r = setup_keyring(unit, context, params, uid, gid);
3317 if (r < 0) {
3318 *exit_status = EXIT_KEYRING;
3319 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3320 }
3321
3322 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3323 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3324
3325 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3326 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3327
3328 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3329 if (needs_ambient_hack)
3330 needs_setuid = false;
3331 else
3332 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3333
3334 if (needs_sandboxing) {
3335 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3336 * present. The actual MAC context application will happen later, as late as possible, to avoid
3337 * impacting our own code paths. */
3338
3339 #if HAVE_SELINUX
3340 use_selinux = mac_selinux_use();
3341 #endif
3342 #if ENABLE_SMACK
3343 use_smack = mac_smack_use();
3344 #endif
3345 #if HAVE_APPARMOR
3346 use_apparmor = mac_apparmor_use();
3347 #endif
3348 }
3349
3350 if (needs_sandboxing) {
3351 int which_failed;
3352
3353 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3354 * is set here. (See below.) */
3355
3356 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3357 if (r < 0) {
3358 *exit_status = EXIT_LIMITS;
3359 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3360 }
3361 }
3362
3363 if (needs_setuid) {
3364
3365 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3366 * wins here. (See above.) */
3367
3368 if (context->pam_name && username) {
3369 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3370 if (r < 0) {
3371 *exit_status = EXIT_PAM;
3372 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3373 }
3374 }
3375 }
3376
3377 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3378
3379 if (ns_type_supported(NAMESPACE_NET)) {
3380 r = setup_netns(runtime->netns_storage_socket);
3381 if (r < 0) {
3382 *exit_status = EXIT_NETWORK;
3383 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3384 }
3385 } else if (context->network_namespace_path) {
3386 *exit_status = EXIT_NETWORK;
3387 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP), "NetworkNamespacePath= is not supported, refusing.");
3388 } else
3389 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3390 }
3391
3392 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3393 if (needs_mount_namespace) {
3394 _cleanup_free_ char *error_path = NULL;
3395
3396 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3397 if (r < 0) {
3398 *exit_status = EXIT_NAMESPACE;
3399 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3400 error_path ? ": " : "", strempty(error_path));
3401 }
3402 }
3403
3404 if (context->protect_hostname) {
3405 if (ns_type_supported(NAMESPACE_UTS)) {
3406 if (unshare(CLONE_NEWUTS) < 0) {
3407 *exit_status = EXIT_NAMESPACE;
3408 return log_unit_error_errno(unit, errno, "Failed to set up UTS namespacing: %m");
3409 }
3410 } else
3411 log_unit_warning(unit, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
3412 #if HAVE_SECCOMP
3413 r = seccomp_protect_hostname();
3414 if (r < 0) {
3415 *exit_status = EXIT_SECCOMP;
3416 return log_unit_error_errno(unit, r, "Failed to apply hostname restrictions: %m");
3417 }
3418 #endif
3419 }
3420
3421 /* Drop groups as early as possible */
3422 if (needs_setuid) {
3423 r = enforce_groups(gid, supplementary_gids, ngids);
3424 if (r < 0) {
3425 *exit_status = EXIT_GROUP;
3426 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3427 }
3428 }
3429
3430 if (needs_sandboxing) {
3431 #if HAVE_SELINUX
3432 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3433 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3434 if (r < 0) {
3435 *exit_status = EXIT_SELINUX_CONTEXT;
3436 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3437 }
3438 }
3439 #endif
3440
3441 if (context->private_users) {
3442 r = setup_private_users(uid, gid);
3443 if (r < 0) {
3444 *exit_status = EXIT_USER;
3445 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3446 }
3447 }
3448 }
3449
3450 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3451 * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3452 * however if we have it as we want to keep it open until the final execve(). */
3453
3454 if (params->exec_fd >= 0) {
3455 exec_fd = params->exec_fd;
3456
3457 if (exec_fd < 3 + (int) n_fds) {
3458 int moved_fd;
3459
3460 /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3461 * process we are about to execute. */
3462
3463 moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3464 if (moved_fd < 0) {
3465 *exit_status = EXIT_FDS;
3466 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3467 }
3468
3469 safe_close(exec_fd);
3470 exec_fd = moved_fd;
3471 } else {
3472 /* This fd should be FD_CLOEXEC already, but let's make sure. */
3473 r = fd_cloexec(exec_fd, true);
3474 if (r < 0) {
3475 *exit_status = EXIT_FDS;
3476 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3477 }
3478 }
3479
3480 fds_with_exec_fd = newa(int, n_fds + 1);
3481 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3482 fds_with_exec_fd[n_fds] = exec_fd;
3483 n_fds_with_exec_fd = n_fds + 1;
3484 } else {
3485 fds_with_exec_fd = fds;
3486 n_fds_with_exec_fd = n_fds;
3487 }
3488
3489 r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3490 if (r >= 0)
3491 r = shift_fds(fds, n_fds);
3492 if (r >= 0)
3493 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3494 if (r < 0) {
3495 *exit_status = EXIT_FDS;
3496 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3497 }
3498
3499 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3500 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3501 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3502 * came this far. */
3503
3504 secure_bits = context->secure_bits;
3505
3506 if (needs_sandboxing) {
3507 uint64_t bset;
3508
3509 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3510 * requested. (Note this is placed after the general resource limit initialization, see
3511 * above, in order to take precedence.) */
3512 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3513 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3514 *exit_status = EXIT_LIMITS;
3515 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3516 }
3517 }
3518
3519 #if ENABLE_SMACK
3520 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3521 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3522 if (use_smack) {
3523 r = setup_smack(context, command);
3524 if (r < 0) {
3525 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3526 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3527 }
3528 }
3529 #endif
3530
3531 bset = context->capability_bounding_set;
3532 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3533 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3534 * instead of us doing that */
3535 if (needs_ambient_hack)
3536 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3537 (UINT64_C(1) << CAP_SETUID) |
3538 (UINT64_C(1) << CAP_SETGID);
3539
3540 if (!cap_test_all(bset)) {
3541 r = capability_bounding_set_drop(bset, false);
3542 if (r < 0) {
3543 *exit_status = EXIT_CAPABILITIES;
3544 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3545 }
3546 }
3547
3548 /* This is done before enforce_user, but ambient set
3549 * does not survive over setresuid() if keep_caps is not set. */
3550 if (!needs_ambient_hack &&
3551 context->capability_ambient_set != 0) {
3552 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3553 if (r < 0) {
3554 *exit_status = EXIT_CAPABILITIES;
3555 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3556 }
3557 }
3558 }
3559
3560 /* chroot to root directory first, before we lose the ability to chroot */
3561 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
3562 if (r < 0)
3563 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
3564
3565 if (needs_setuid) {
3566 if (uid_is_valid(uid)) {
3567 r = enforce_user(context, uid);
3568 if (r < 0) {
3569 *exit_status = EXIT_USER;
3570 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3571 }
3572
3573 if (!needs_ambient_hack &&
3574 context->capability_ambient_set != 0) {
3575
3576 /* Fix the ambient capabilities after user change. */
3577 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3578 if (r < 0) {
3579 *exit_status = EXIT_CAPABILITIES;
3580 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3581 }
3582
3583 /* If we were asked to change user and ambient capabilities
3584 * were requested, we had to add keep-caps to the securebits
3585 * so that we would maintain the inherited capability set
3586 * through the setresuid(). Make sure that the bit is added
3587 * also to the context secure_bits so that we don't try to
3588 * drop the bit away next. */
3589
3590 secure_bits |= 1<<SECURE_KEEP_CAPS;
3591 }
3592 }
3593 }
3594
3595 /* Apply working directory here, because the working directory might be on NFS and only the user running
3596 * this service might have the correct privilege to change to the working directory */
3597 r = apply_working_directory(context, params, home, exit_status);
3598 if (r < 0)
3599 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3600
3601 if (needs_sandboxing) {
3602 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3603 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3604 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3605 * are restricted. */
3606
3607 #if HAVE_SELINUX
3608 if (use_selinux) {
3609 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3610
3611 if (exec_context) {
3612 r = setexeccon(exec_context);
3613 if (r < 0) {
3614 *exit_status = EXIT_SELINUX_CONTEXT;
3615 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3616 }
3617 }
3618 }
3619 #endif
3620
3621 #if HAVE_APPARMOR
3622 if (use_apparmor && context->apparmor_profile) {
3623 r = aa_change_onexec(context->apparmor_profile);
3624 if (r < 0 && !context->apparmor_profile_ignore) {
3625 *exit_status = EXIT_APPARMOR_PROFILE;
3626 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3627 }
3628 }
3629 #endif
3630
3631 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3632 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3633 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3634 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3635 *exit_status = EXIT_SECUREBITS;
3636 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3637 }
3638
3639 if (context_has_no_new_privileges(context))
3640 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3641 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3642 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3643 }
3644
3645 #if HAVE_SECCOMP
3646 r = apply_address_families(unit, context);
3647 if (r < 0) {
3648 *exit_status = EXIT_ADDRESS_FAMILIES;
3649 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3650 }
3651
3652 r = apply_memory_deny_write_execute(unit, context);
3653 if (r < 0) {
3654 *exit_status = EXIT_SECCOMP;
3655 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3656 }
3657
3658 r = apply_restrict_realtime(unit, context);
3659 if (r < 0) {
3660 *exit_status = EXIT_SECCOMP;
3661 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3662 }
3663
3664 r = apply_restrict_suid_sgid(unit, context);
3665 if (r < 0) {
3666 *exit_status = EXIT_SECCOMP;
3667 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3668 }
3669
3670 r = apply_restrict_namespaces(unit, context);
3671 if (r < 0) {
3672 *exit_status = EXIT_SECCOMP;
3673 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3674 }
3675
3676 r = apply_protect_sysctl(unit, context);
3677 if (r < 0) {
3678 *exit_status = EXIT_SECCOMP;
3679 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3680 }
3681
3682 r = apply_protect_kernel_modules(unit, context);
3683 if (r < 0) {
3684 *exit_status = EXIT_SECCOMP;
3685 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3686 }
3687
3688 r = apply_private_devices(unit, context);
3689 if (r < 0) {
3690 *exit_status = EXIT_SECCOMP;
3691 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3692 }
3693
3694 r = apply_syscall_archs(unit, context);
3695 if (r < 0) {
3696 *exit_status = EXIT_SECCOMP;
3697 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3698 }
3699
3700 r = apply_lock_personality(unit, context);
3701 if (r < 0) {
3702 *exit_status = EXIT_SECCOMP;
3703 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3704 }
3705
3706 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3707 * by the filter as little as possible. */
3708 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3709 if (r < 0) {
3710 *exit_status = EXIT_SECCOMP;
3711 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3712 }
3713 #endif
3714 }
3715
3716 if (!strv_isempty(context->unset_environment)) {
3717 char **ee = NULL;
3718
3719 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3720 if (!ee) {
3721 *exit_status = EXIT_MEMORY;
3722 return log_oom();
3723 }
3724
3725 strv_free_and_replace(accum_env, ee);
3726 }
3727
3728 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3729 replaced_argv = replace_env_argv(command->argv, accum_env);
3730 if (!replaced_argv) {
3731 *exit_status = EXIT_MEMORY;
3732 return log_oom();
3733 }
3734 final_argv = replaced_argv;
3735 } else
3736 final_argv = command->argv;
3737
3738 if (DEBUG_LOGGING) {
3739 _cleanup_free_ char *line;
3740
3741 line = exec_command_line(final_argv);
3742 if (line)
3743 log_struct(LOG_DEBUG,
3744 "EXECUTABLE=%s", command->path,
3745 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3746 LOG_UNIT_ID(unit),
3747 LOG_UNIT_INVOCATION_ID(unit));
3748 }
3749
3750 if (exec_fd >= 0) {
3751 uint8_t hot = 1;
3752
3753 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3754 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3755
3756 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3757 *exit_status = EXIT_EXEC;
3758 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3759 }
3760 }
3761
3762 execve(command->path, final_argv, accum_env);
3763 r = -errno;
3764
3765 if (exec_fd >= 0) {
3766 uint8_t hot = 0;
3767
3768 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3769 * that POLLHUP on it no longer means execve() succeeded. */
3770
3771 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3772 *exit_status = EXIT_EXEC;
3773 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
3774 }
3775 }
3776
3777 if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3778 log_struct_errno(LOG_INFO, r,
3779 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3780 LOG_UNIT_ID(unit),
3781 LOG_UNIT_INVOCATION_ID(unit),
3782 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3783 command->path),
3784 "EXECUTABLE=%s", command->path);
3785 return 0;
3786 }
3787
3788 *exit_status = EXIT_EXEC;
3789 return log_unit_error_errno(unit, r, "Failed to execute command: %m");
3790 }
3791
3792 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
3793 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
3794
3795 int exec_spawn(Unit *unit,
3796 ExecCommand *command,
3797 const ExecContext *context,
3798 const ExecParameters *params,
3799 ExecRuntime *runtime,
3800 DynamicCreds *dcreds,
3801 pid_t *ret) {
3802
3803 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
3804 _cleanup_free_ char *subcgroup_path = NULL;
3805 _cleanup_strv_free_ char **files_env = NULL;
3806 size_t n_storage_fds = 0, n_socket_fds = 0;
3807 _cleanup_free_ char *line = NULL;
3808 pid_t pid;
3809
3810 assert(unit);
3811 assert(command);
3812 assert(context);
3813 assert(ret);
3814 assert(params);
3815 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
3816
3817 if (context->std_input == EXEC_INPUT_SOCKET ||
3818 context->std_output == EXEC_OUTPUT_SOCKET ||
3819 context->std_error == EXEC_OUTPUT_SOCKET) {
3820
3821 if (params->n_socket_fds > 1) {
3822 log_unit_error(unit, "Got more than one socket.");
3823 return -EINVAL;
3824 }
3825
3826 if (params->n_socket_fds == 0) {
3827 log_unit_error(unit, "Got no socket.");
3828 return -EINVAL;
3829 }
3830
3831 socket_fd = params->fds[0];
3832 } else {
3833 socket_fd = -1;
3834 fds = params->fds;
3835 n_socket_fds = params->n_socket_fds;
3836 n_storage_fds = params->n_storage_fds;
3837 }
3838
3839 r = exec_context_named_iofds(context, params, named_iofds);
3840 if (r < 0)
3841 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3842
3843 r = exec_context_load_environment(unit, context, &files_env);
3844 if (r < 0)
3845 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3846
3847 line = exec_command_line(command->argv);
3848 if (!line)
3849 return log_oom();
3850
3851 log_struct(LOG_DEBUG,
3852 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3853 "EXECUTABLE=%s", command->path,
3854 LOG_UNIT_ID(unit),
3855 LOG_UNIT_INVOCATION_ID(unit));
3856
3857 if (params->cgroup_path) {
3858 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
3859 if (r < 0)
3860 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
3861 if (r > 0) { /* We are using a child cgroup */
3862 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
3863 if (r < 0)
3864 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
3865 }
3866 }
3867
3868 pid = fork();
3869 if (pid < 0)
3870 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3871
3872 if (pid == 0) {
3873 int exit_status = EXIT_SUCCESS;
3874
3875 r = exec_child(unit,
3876 command,
3877 context,
3878 params,
3879 runtime,
3880 dcreds,
3881 socket_fd,
3882 named_iofds,
3883 fds,
3884 n_socket_fds,
3885 n_storage_fds,
3886 files_env,
3887 unit->manager->user_lookup_fds[1],
3888 &exit_status);
3889
3890 if (r < 0) {
3891 const char *status =
3892 exit_status_to_string(exit_status,
3893 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
3894
3895 log_struct_errno(LOG_ERR, r,
3896 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3897 LOG_UNIT_ID(unit),
3898 LOG_UNIT_INVOCATION_ID(unit),
3899 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3900 status, command->path),
3901 "EXECUTABLE=%s", command->path);
3902 }
3903
3904 _exit(exit_status);
3905 }
3906
3907 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3908
3909 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
3910 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
3911 * process will be killed too). */
3912 if (subcgroup_path)
3913 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
3914
3915 exec_status_start(&command->exec_status, pid);
3916
3917 *ret = pid;
3918 return 0;
3919 }
3920
3921 void exec_context_init(ExecContext *c) {
3922 ExecDirectoryType i;
3923
3924 assert(c);
3925
3926 c->umask = 0022;
3927 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3928 c->cpu_sched_policy = SCHED_OTHER;
3929 c->syslog_priority = LOG_DAEMON|LOG_INFO;
3930 c->syslog_level_prefix = true;
3931 c->ignore_sigpipe = true;
3932 c->timer_slack_nsec = NSEC_INFINITY;
3933 c->personality = PERSONALITY_INVALID;
3934 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3935 c->directories[i].mode = 0755;
3936 c->timeout_clean_usec = USEC_INFINITY;
3937 c->capability_bounding_set = CAP_ALL;
3938 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
3939 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
3940 c->log_level_max = -1;
3941 numa_policy_reset(&c->numa_policy);
3942 }
3943
3944 void exec_context_done(ExecContext *c) {
3945 ExecDirectoryType i;
3946 size_t l;
3947
3948 assert(c);
3949
3950 c->environment = strv_free(c->environment);
3951 c->environment_files = strv_free(c->environment_files);
3952 c->pass_environment = strv_free(c->pass_environment);
3953 c->unset_environment = strv_free(c->unset_environment);
3954
3955 rlimit_free_all(c->rlimit);
3956
3957 for (l = 0; l < 3; l++) {
3958 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3959 c->stdio_file[l] = mfree(c->stdio_file[l]);
3960 }
3961
3962 c->working_directory = mfree(c->working_directory);
3963 c->root_directory = mfree(c->root_directory);
3964 c->root_image = mfree(c->root_image);
3965 c->tty_path = mfree(c->tty_path);
3966 c->syslog_identifier = mfree(c->syslog_identifier);
3967 c->user = mfree(c->user);
3968 c->group = mfree(c->group);
3969
3970 c->supplementary_groups = strv_free(c->supplementary_groups);
3971
3972 c->pam_name = mfree(c->pam_name);
3973
3974 c->read_only_paths = strv_free(c->read_only_paths);
3975 c->read_write_paths = strv_free(c->read_write_paths);
3976 c->inaccessible_paths = strv_free(c->inaccessible_paths);
3977
3978 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3979 c->bind_mounts = NULL;
3980 c->n_bind_mounts = 0;
3981 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3982 c->temporary_filesystems = NULL;
3983 c->n_temporary_filesystems = 0;
3984
3985 cpu_set_reset(&c->cpu_set);
3986 numa_policy_reset(&c->numa_policy);
3987
3988 c->utmp_id = mfree(c->utmp_id);
3989 c->selinux_context = mfree(c->selinux_context);
3990 c->apparmor_profile = mfree(c->apparmor_profile);
3991 c->smack_process_label = mfree(c->smack_process_label);
3992
3993 c->syscall_filter = hashmap_free(c->syscall_filter);
3994 c->syscall_archs = set_free(c->syscall_archs);
3995 c->address_families = set_free(c->address_families);
3996
3997 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3998 c->directories[i].paths = strv_free(c->directories[i].paths);
3999
4000 c->log_level_max = -1;
4001
4002 exec_context_free_log_extra_fields(c);
4003
4004 c->log_ratelimit_interval_usec = 0;
4005 c->log_ratelimit_burst = 0;
4006
4007 c->stdin_data = mfree(c->stdin_data);
4008 c->stdin_data_size = 0;
4009
4010 c->network_namespace_path = mfree(c->network_namespace_path);
4011 }
4012
4013 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4014 char **i;
4015
4016 assert(c);
4017
4018 if (!runtime_prefix)
4019 return 0;
4020
4021 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4022 _cleanup_free_ char *p;
4023
4024 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4025 p = path_join(runtime_prefix, "private", *i);
4026 else
4027 p = path_join(runtime_prefix, *i);
4028 if (!p)
4029 return -ENOMEM;
4030
4031 /* We execute this synchronously, since we need to be sure this is gone when we start the
4032 * service next. */
4033 (void) rm_rf(p, REMOVE_ROOT);
4034 }
4035
4036 return 0;
4037 }
4038
4039 static void exec_command_done(ExecCommand *c) {
4040 assert(c);
4041
4042 c->path = mfree(c->path);
4043 c->argv = strv_free(c->argv);
4044 }
4045
4046 void exec_command_done_array(ExecCommand *c, size_t n) {
4047 size_t i;
4048
4049 for (i = 0; i < n; i++)
4050 exec_command_done(c+i);
4051 }
4052
4053 ExecCommand* exec_command_free_list(ExecCommand *c) {
4054 ExecCommand *i;
4055
4056 while ((i = c)) {
4057 LIST_REMOVE(command, c, i);
4058 exec_command_done(i);
4059 free(i);
4060 }
4061
4062 return NULL;
4063 }
4064
4065 void exec_command_free_array(ExecCommand **c, size_t n) {
4066 size_t i;
4067
4068 for (i = 0; i < n; i++)
4069 c[i] = exec_command_free_list(c[i]);
4070 }
4071
4072 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4073 size_t i;
4074
4075 for (i = 0; i < n; i++)
4076 exec_status_reset(&c[i].exec_status);
4077 }
4078
4079 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4080 size_t i;
4081
4082 for (i = 0; i < n; i++) {
4083 ExecCommand *z;
4084
4085 LIST_FOREACH(command, z, c[i])
4086 exec_status_reset(&z->exec_status);
4087 }
4088 }
4089
4090 typedef struct InvalidEnvInfo {
4091 const Unit *unit;
4092 const char *path;
4093 } InvalidEnvInfo;
4094
4095 static void invalid_env(const char *p, void *userdata) {
4096 InvalidEnvInfo *info = userdata;
4097
4098 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4099 }
4100
4101 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4102 assert(c);
4103
4104 switch (fd_index) {
4105
4106 case STDIN_FILENO:
4107 if (c->std_input != EXEC_INPUT_NAMED_FD)
4108 return NULL;
4109
4110 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4111
4112 case STDOUT_FILENO:
4113 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4114 return NULL;
4115
4116 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4117
4118 case STDERR_FILENO:
4119 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4120 return NULL;
4121
4122 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4123
4124 default:
4125 return NULL;
4126 }
4127 }
4128
4129 static int exec_context_named_iofds(
4130 const ExecContext *c,
4131 const ExecParameters *p,
4132 int named_iofds[static 3]) {
4133
4134 size_t i, targets;
4135 const char* stdio_fdname[3];
4136 size_t n_fds;
4137
4138 assert(c);
4139 assert(p);
4140 assert(named_iofds);
4141
4142 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4143 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4144 (c->std_error == EXEC_OUTPUT_NAMED_FD);
4145
4146 for (i = 0; i < 3; i++)
4147 stdio_fdname[i] = exec_context_fdname(c, i);
4148
4149 n_fds = p->n_storage_fds + p->n_socket_fds;
4150
4151 for (i = 0; i < n_fds && targets > 0; i++)
4152 if (named_iofds[STDIN_FILENO] < 0 &&
4153 c->std_input == EXEC_INPUT_NAMED_FD &&
4154 stdio_fdname[STDIN_FILENO] &&
4155 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4156
4157 named_iofds[STDIN_FILENO] = p->fds[i];
4158 targets--;
4159
4160 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4161 c->std_output == EXEC_OUTPUT_NAMED_FD &&
4162 stdio_fdname[STDOUT_FILENO] &&
4163 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4164
4165 named_iofds[STDOUT_FILENO] = p->fds[i];
4166 targets--;
4167
4168 } else if (named_iofds[STDERR_FILENO] < 0 &&
4169 c->std_error == EXEC_OUTPUT_NAMED_FD &&
4170 stdio_fdname[STDERR_FILENO] &&
4171 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4172
4173 named_iofds[STDERR_FILENO] = p->fds[i];
4174 targets--;
4175 }
4176
4177 return targets == 0 ? 0 : -ENOENT;
4178 }
4179
4180 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
4181 char **i, **r = NULL;
4182
4183 assert(c);
4184 assert(l);
4185
4186 STRV_FOREACH(i, c->environment_files) {
4187 char *fn;
4188 int k;
4189 unsigned n;
4190 bool ignore = false;
4191 char **p;
4192 _cleanup_globfree_ glob_t pglob = {};
4193
4194 fn = *i;
4195
4196 if (fn[0] == '-') {
4197 ignore = true;
4198 fn++;
4199 }
4200
4201 if (!path_is_absolute(fn)) {
4202 if (ignore)
4203 continue;
4204
4205 strv_free(r);
4206 return -EINVAL;
4207 }
4208
4209 /* Filename supports globbing, take all matching files */
4210 k = safe_glob(fn, 0, &pglob);
4211 if (k < 0) {
4212 if (ignore)
4213 continue;
4214
4215 strv_free(r);
4216 return k;
4217 }
4218
4219 /* When we don't match anything, -ENOENT should be returned */
4220 assert(pglob.gl_pathc > 0);
4221
4222 for (n = 0; n < pglob.gl_pathc; n++) {
4223 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4224 if (k < 0) {
4225 if (ignore)
4226 continue;
4227
4228 strv_free(r);
4229 return k;
4230 }
4231 /* Log invalid environment variables with filename */
4232 if (p) {
4233 InvalidEnvInfo info = {
4234 .unit = unit,
4235 .path = pglob.gl_pathv[n]
4236 };
4237
4238 p = strv_env_clean_with_callback(p, invalid_env, &info);
4239 }
4240
4241 if (!r)
4242 r = p;
4243 else {
4244 char **m;
4245
4246 m = strv_env_merge(2, r, p);
4247 strv_free(r);
4248 strv_free(p);
4249 if (!m)
4250 return -ENOMEM;
4251
4252 r = m;
4253 }
4254 }
4255 }
4256
4257 *l = r;
4258
4259 return 0;
4260 }
4261
4262 static bool tty_may_match_dev_console(const char *tty) {
4263 _cleanup_free_ char *resolved = NULL;
4264
4265 if (!tty)
4266 return true;
4267
4268 tty = skip_dev_prefix(tty);
4269
4270 /* trivial identity? */
4271 if (streq(tty, "console"))
4272 return true;
4273
4274 if (resolve_dev_console(&resolved) < 0)
4275 return true; /* if we could not resolve, assume it may */
4276
4277 /* "tty0" means the active VC, so it may be the same sometimes */
4278 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4279 }
4280
4281 static bool exec_context_may_touch_tty(const ExecContext *ec) {
4282 assert(ec);
4283
4284 return ec->tty_reset ||
4285 ec->tty_vhangup ||
4286 ec->tty_vt_disallocate ||
4287 is_terminal_input(ec->std_input) ||
4288 is_terminal_output(ec->std_output) ||
4289 is_terminal_output(ec->std_error);
4290 }
4291
4292 bool exec_context_may_touch_console(const ExecContext *ec) {
4293
4294 return exec_context_may_touch_tty(ec) &&
4295 tty_may_match_dev_console(exec_context_tty_path(ec));
4296 }
4297
4298 static void strv_fprintf(FILE *f, char **l) {
4299 char **g;
4300
4301 assert(f);
4302
4303 STRV_FOREACH(g, l)
4304 fprintf(f, " %s", *g);
4305 }
4306
4307 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4308 char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
4309 ExecDirectoryType dt;
4310 unsigned i;
4311 int r;
4312
4313 assert(c);
4314 assert(f);
4315
4316 prefix = strempty(prefix);
4317
4318 fprintf(f,
4319 "%sUMask: %04o\n"
4320 "%sWorkingDirectory: %s\n"
4321 "%sRootDirectory: %s\n"
4322 "%sNonBlocking: %s\n"
4323 "%sPrivateTmp: %s\n"
4324 "%sPrivateDevices: %s\n"
4325 "%sProtectKernelTunables: %s\n"
4326 "%sProtectKernelModules: %s\n"
4327 "%sProtectControlGroups: %s\n"
4328 "%sPrivateNetwork: %s\n"
4329 "%sPrivateUsers: %s\n"
4330 "%sProtectHome: %s\n"
4331 "%sProtectSystem: %s\n"
4332 "%sMountAPIVFS: %s\n"
4333 "%sIgnoreSIGPIPE: %s\n"
4334 "%sMemoryDenyWriteExecute: %s\n"
4335 "%sRestrictRealtime: %s\n"
4336 "%sRestrictSUIDSGID: %s\n"
4337 "%sKeyringMode: %s\n"
4338 "%sProtectHostname: %s\n",
4339 prefix, c->umask,
4340 prefix, c->working_directory ? c->working_directory : "/",
4341 prefix, c->root_directory ? c->root_directory : "/",
4342 prefix, yes_no(c->non_blocking),
4343 prefix, yes_no(c->private_tmp),
4344 prefix, yes_no(c->private_devices),
4345 prefix, yes_no(c->protect_kernel_tunables),
4346 prefix, yes_no(c->protect_kernel_modules),
4347 prefix, yes_no(c->protect_control_groups),
4348 prefix, yes_no(c->private_network),
4349 prefix, yes_no(c->private_users),
4350 prefix, protect_home_to_string(c->protect_home),
4351 prefix, protect_system_to_string(c->protect_system),
4352 prefix, yes_no(c->mount_apivfs),
4353 prefix, yes_no(c->ignore_sigpipe),
4354 prefix, yes_no(c->memory_deny_write_execute),
4355 prefix, yes_no(c->restrict_realtime),
4356 prefix, yes_no(c->restrict_suid_sgid),
4357 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4358 prefix, yes_no(c->protect_hostname));
4359
4360 if (c->root_image)
4361 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4362
4363 STRV_FOREACH(e, c->environment)
4364 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4365
4366 STRV_FOREACH(e, c->environment_files)
4367 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4368
4369 STRV_FOREACH(e, c->pass_environment)
4370 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4371
4372 STRV_FOREACH(e, c->unset_environment)
4373 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4374
4375 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4376
4377 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4378 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4379
4380 STRV_FOREACH(d, c->directories[dt].paths)
4381 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4382 }
4383
4384 fprintf(f,
4385 "%sTimeoutCleanSec: %s\n",
4386 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
4387
4388 if (c->nice_set)
4389 fprintf(f,
4390 "%sNice: %i\n",
4391 prefix, c->nice);
4392
4393 if (c->oom_score_adjust_set)
4394 fprintf(f,
4395 "%sOOMScoreAdjust: %i\n",
4396 prefix, c->oom_score_adjust);
4397
4398 for (i = 0; i < RLIM_NLIMITS; i++)
4399 if (c->rlimit[i]) {
4400 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4401 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4402 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4403 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4404 }
4405
4406 if (c->ioprio_set) {
4407 _cleanup_free_ char *class_str = NULL;
4408
4409 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4410 if (r >= 0)
4411 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4412
4413 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4414 }
4415
4416 if (c->cpu_sched_set) {
4417 _cleanup_free_ char *policy_str = NULL;
4418
4419 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4420 if (r >= 0)
4421 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4422
4423 fprintf(f,
4424 "%sCPUSchedulingPriority: %i\n"
4425 "%sCPUSchedulingResetOnFork: %s\n",
4426 prefix, c->cpu_sched_priority,
4427 prefix, yes_no(c->cpu_sched_reset_on_fork));
4428 }
4429
4430 if (c->cpu_set.set) {
4431 _cleanup_free_ char *affinity = NULL;
4432
4433 affinity = cpu_set_to_range_string(&c->cpu_set);
4434 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
4435 }
4436
4437 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
4438 _cleanup_free_ char *nodes = NULL;
4439
4440 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
4441 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
4442 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
4443 }
4444
4445 if (c->timer_slack_nsec != NSEC_INFINITY)
4446 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4447
4448 fprintf(f,
4449 "%sStandardInput: %s\n"
4450 "%sStandardOutput: %s\n"
4451 "%sStandardError: %s\n",
4452 prefix, exec_input_to_string(c->std_input),
4453 prefix, exec_output_to_string(c->std_output),
4454 prefix, exec_output_to_string(c->std_error));
4455
4456 if (c->std_input == EXEC_INPUT_NAMED_FD)
4457 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4458 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4459 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4460 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4461 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4462
4463 if (c->std_input == EXEC_INPUT_FILE)
4464 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4465 if (c->std_output == EXEC_OUTPUT_FILE)
4466 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4467 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4468 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4469 if (c->std_error == EXEC_OUTPUT_FILE)
4470 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4471 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4472 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4473
4474 if (c->tty_path)
4475 fprintf(f,
4476 "%sTTYPath: %s\n"
4477 "%sTTYReset: %s\n"
4478 "%sTTYVHangup: %s\n"
4479 "%sTTYVTDisallocate: %s\n",
4480 prefix, c->tty_path,
4481 prefix, yes_no(c->tty_reset),
4482 prefix, yes_no(c->tty_vhangup),
4483 prefix, yes_no(c->tty_vt_disallocate));
4484
4485 if (IN_SET(c->std_output,
4486 EXEC_OUTPUT_SYSLOG,
4487 EXEC_OUTPUT_KMSG,
4488 EXEC_OUTPUT_JOURNAL,
4489 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4490 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4491 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4492 IN_SET(c->std_error,
4493 EXEC_OUTPUT_SYSLOG,
4494 EXEC_OUTPUT_KMSG,
4495 EXEC_OUTPUT_JOURNAL,
4496 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4497 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4498 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4499
4500 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4501
4502 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4503 if (r >= 0)
4504 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4505
4506 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4507 if (r >= 0)
4508 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4509 }
4510
4511 if (c->log_level_max >= 0) {
4512 _cleanup_free_ char *t = NULL;
4513
4514 (void) log_level_to_string_alloc(c->log_level_max, &t);
4515
4516 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4517 }
4518
4519 if (c->log_ratelimit_interval_usec > 0) {
4520 char buf_timespan[FORMAT_TIMESPAN_MAX];
4521
4522 fprintf(f,
4523 "%sLogRateLimitIntervalSec: %s\n",
4524 prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
4525 }
4526
4527 if (c->log_ratelimit_burst > 0)
4528 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
4529
4530 if (c->n_log_extra_fields > 0) {
4531 size_t j;
4532
4533 for (j = 0; j < c->n_log_extra_fields; j++) {
4534 fprintf(f, "%sLogExtraFields: ", prefix);
4535 fwrite(c->log_extra_fields[j].iov_base,
4536 1, c->log_extra_fields[j].iov_len,
4537 f);
4538 fputc('\n', f);
4539 }
4540 }
4541
4542 if (c->secure_bits) {
4543 _cleanup_free_ char *str = NULL;
4544
4545 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4546 if (r >= 0)
4547 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4548 }
4549
4550 if (c->capability_bounding_set != CAP_ALL) {
4551 _cleanup_free_ char *str = NULL;
4552
4553 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4554 if (r >= 0)
4555 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4556 }
4557
4558 if (c->capability_ambient_set != 0) {
4559 _cleanup_free_ char *str = NULL;
4560
4561 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4562 if (r >= 0)
4563 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4564 }
4565
4566 if (c->user)
4567 fprintf(f, "%sUser: %s\n", prefix, c->user);
4568 if (c->group)
4569 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4570
4571 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4572
4573 if (!strv_isempty(c->supplementary_groups)) {
4574 fprintf(f, "%sSupplementaryGroups:", prefix);
4575 strv_fprintf(f, c->supplementary_groups);
4576 fputs("\n", f);
4577 }
4578
4579 if (c->pam_name)
4580 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4581
4582 if (!strv_isempty(c->read_write_paths)) {
4583 fprintf(f, "%sReadWritePaths:", prefix);
4584 strv_fprintf(f, c->read_write_paths);
4585 fputs("\n", f);
4586 }
4587
4588 if (!strv_isempty(c->read_only_paths)) {
4589 fprintf(f, "%sReadOnlyPaths:", prefix);
4590 strv_fprintf(f, c->read_only_paths);
4591 fputs("\n", f);
4592 }
4593
4594 if (!strv_isempty(c->inaccessible_paths)) {
4595 fprintf(f, "%sInaccessiblePaths:", prefix);
4596 strv_fprintf(f, c->inaccessible_paths);
4597 fputs("\n", f);
4598 }
4599
4600 if (c->n_bind_mounts > 0)
4601 for (i = 0; i < c->n_bind_mounts; i++)
4602 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4603 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4604 c->bind_mounts[i].ignore_enoent ? "-": "",
4605 c->bind_mounts[i].source,
4606 c->bind_mounts[i].destination,
4607 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4608
4609 if (c->n_temporary_filesystems > 0)
4610 for (i = 0; i < c->n_temporary_filesystems; i++) {
4611 TemporaryFileSystem *t = c->temporary_filesystems + i;
4612
4613 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4614 t->path,
4615 isempty(t->options) ? "" : ":",
4616 strempty(t->options));
4617 }
4618
4619 if (c->utmp_id)
4620 fprintf(f,
4621 "%sUtmpIdentifier: %s\n",
4622 prefix, c->utmp_id);
4623
4624 if (c->selinux_context)
4625 fprintf(f,
4626 "%sSELinuxContext: %s%s\n",
4627 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4628
4629 if (c->apparmor_profile)
4630 fprintf(f,
4631 "%sAppArmorProfile: %s%s\n",
4632 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4633
4634 if (c->smack_process_label)
4635 fprintf(f,
4636 "%sSmackProcessLabel: %s%s\n",
4637 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4638
4639 if (c->personality != PERSONALITY_INVALID)
4640 fprintf(f,
4641 "%sPersonality: %s\n",
4642 prefix, strna(personality_to_string(c->personality)));
4643
4644 fprintf(f,
4645 "%sLockPersonality: %s\n",
4646 prefix, yes_no(c->lock_personality));
4647
4648 if (c->syscall_filter) {
4649 #if HAVE_SECCOMP
4650 Iterator j;
4651 void *id, *val;
4652 bool first = true;
4653 #endif
4654
4655 fprintf(f,
4656 "%sSystemCallFilter: ",
4657 prefix);
4658
4659 if (!c->syscall_whitelist)
4660 fputc('~', f);
4661
4662 #if HAVE_SECCOMP
4663 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4664 _cleanup_free_ char *name = NULL;
4665 const char *errno_name = NULL;
4666 int num = PTR_TO_INT(val);
4667
4668 if (first)
4669 first = false;
4670 else
4671 fputc(' ', f);
4672
4673 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4674 fputs(strna(name), f);
4675
4676 if (num >= 0) {
4677 errno_name = errno_to_name(num);
4678 if (errno_name)
4679 fprintf(f, ":%s", errno_name);
4680 else
4681 fprintf(f, ":%d", num);
4682 }
4683 }
4684 #endif
4685
4686 fputc('\n', f);
4687 }
4688
4689 if (c->syscall_archs) {
4690 #if HAVE_SECCOMP
4691 Iterator j;
4692 void *id;
4693 #endif
4694
4695 fprintf(f,
4696 "%sSystemCallArchitectures:",
4697 prefix);
4698
4699 #if HAVE_SECCOMP
4700 SET_FOREACH(id, c->syscall_archs, j)
4701 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4702 #endif
4703 fputc('\n', f);
4704 }
4705
4706 if (exec_context_restrict_namespaces_set(c)) {
4707 _cleanup_free_ char *s = NULL;
4708
4709 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4710 if (r >= 0)
4711 fprintf(f, "%sRestrictNamespaces: %s\n",
4712 prefix, s);
4713 }
4714
4715 if (c->network_namespace_path)
4716 fprintf(f,
4717 "%sNetworkNamespacePath: %s\n",
4718 prefix, c->network_namespace_path);
4719
4720 if (c->syscall_errno > 0) {
4721 const char *errno_name;
4722
4723 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4724
4725 errno_name = errno_to_name(c->syscall_errno);
4726 if (errno_name)
4727 fprintf(f, "%s\n", errno_name);
4728 else
4729 fprintf(f, "%d\n", c->syscall_errno);
4730 }
4731 }
4732
4733 bool exec_context_maintains_privileges(const ExecContext *c) {
4734 assert(c);
4735
4736 /* Returns true if the process forked off would run under
4737 * an unchanged UID or as root. */
4738
4739 if (!c->user)
4740 return true;
4741
4742 if (streq(c->user, "root") || streq(c->user, "0"))
4743 return true;
4744
4745 return false;
4746 }
4747
4748 int exec_context_get_effective_ioprio(const ExecContext *c) {
4749 int p;
4750
4751 assert(c);
4752
4753 if (c->ioprio_set)
4754 return c->ioprio;
4755
4756 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4757 if (p < 0)
4758 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4759
4760 return p;
4761 }
4762
4763 void exec_context_free_log_extra_fields(ExecContext *c) {
4764 size_t l;
4765
4766 assert(c);
4767
4768 for (l = 0; l < c->n_log_extra_fields; l++)
4769 free(c->log_extra_fields[l].iov_base);
4770 c->log_extra_fields = mfree(c->log_extra_fields);
4771 c->n_log_extra_fields = 0;
4772 }
4773
4774 void exec_context_revert_tty(ExecContext *c) {
4775 int r;
4776
4777 assert(c);
4778
4779 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
4780 exec_context_tty_reset(c, NULL);
4781
4782 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
4783 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
4784 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
4785
4786 if (exec_context_may_touch_tty(c)) {
4787 const char *path;
4788
4789 path = exec_context_tty_path(c);
4790 if (path) {
4791 r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
4792 if (r < 0 && r != -ENOENT)
4793 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
4794 }
4795 }
4796 }
4797
4798 int exec_context_get_clean_directories(
4799 ExecContext *c,
4800 char **prefix,
4801 ExecCleanMask mask,
4802 char ***ret) {
4803
4804 _cleanup_strv_free_ char **l = NULL;
4805 ExecDirectoryType t;
4806 int r;
4807
4808 assert(c);
4809 assert(prefix);
4810 assert(ret);
4811
4812 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4813 char **i;
4814
4815 if (!FLAGS_SET(mask, 1U << t))
4816 continue;
4817
4818 if (!prefix[t])
4819 continue;
4820
4821 STRV_FOREACH(i, c->directories[t].paths) {
4822 char *j;
4823
4824 j = path_join(prefix[t], *i);
4825 if (!j)
4826 return -ENOMEM;
4827
4828 r = strv_consume(&l, j);
4829 if (r < 0)
4830 return r;
4831
4832 /* Also remove private directories unconditionally. */
4833 if (t != EXEC_DIRECTORY_CONFIGURATION) {
4834 j = path_join(prefix[t], "private", *i);
4835 if (!j)
4836 return -ENOMEM;
4837
4838 r = strv_consume(&l, j);
4839 if (r < 0)
4840 return r;
4841 }
4842 }
4843 }
4844
4845 *ret = TAKE_PTR(l);
4846 return 0;
4847 }
4848
4849 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
4850 ExecCleanMask mask = 0;
4851
4852 assert(c);
4853 assert(ret);
4854
4855 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4856 if (!strv_isempty(c->directories[t].paths))
4857 mask |= 1U << t;
4858
4859 *ret = mask;
4860 return 0;
4861 }
4862
4863 void exec_status_start(ExecStatus *s, pid_t pid) {
4864 assert(s);
4865
4866 *s = (ExecStatus) {
4867 .pid = pid,
4868 };
4869
4870 dual_timestamp_get(&s->start_timestamp);
4871 }
4872
4873 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4874 assert(s);
4875
4876 if (s->pid != pid) {
4877 *s = (ExecStatus) {
4878 .pid = pid,
4879 };
4880 }
4881
4882 dual_timestamp_get(&s->exit_timestamp);
4883
4884 s->code = code;
4885 s->status = status;
4886
4887 if (context && context->utmp_id)
4888 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
4889 }
4890
4891 void exec_status_reset(ExecStatus *s) {
4892 assert(s);
4893
4894 *s = (ExecStatus) {};
4895 }
4896
4897 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4898 char buf[FORMAT_TIMESTAMP_MAX];
4899
4900 assert(s);
4901 assert(f);
4902
4903 if (s->pid <= 0)
4904 return;
4905
4906 prefix = strempty(prefix);
4907
4908 fprintf(f,
4909 "%sPID: "PID_FMT"\n",
4910 prefix, s->pid);
4911
4912 if (dual_timestamp_is_set(&s->start_timestamp))
4913 fprintf(f,
4914 "%sStart Timestamp: %s\n",
4915 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4916
4917 if (dual_timestamp_is_set(&s->exit_timestamp))
4918 fprintf(f,
4919 "%sExit Timestamp: %s\n"
4920 "%sExit Code: %s\n"
4921 "%sExit Status: %i\n",
4922 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4923 prefix, sigchld_code_to_string(s->code),
4924 prefix, s->status);
4925 }
4926
4927 static char *exec_command_line(char **argv) {
4928 size_t k;
4929 char *n, *p, **a;
4930 bool first = true;
4931
4932 assert(argv);
4933
4934 k = 1;
4935 STRV_FOREACH(a, argv)
4936 k += strlen(*a)+3;
4937
4938 n = new(char, k);
4939 if (!n)
4940 return NULL;
4941
4942 p = n;
4943 STRV_FOREACH(a, argv) {
4944
4945 if (!first)
4946 *(p++) = ' ';
4947 else
4948 first = false;
4949
4950 if (strpbrk(*a, WHITESPACE)) {
4951 *(p++) = '\'';
4952 p = stpcpy(p, *a);
4953 *(p++) = '\'';
4954 } else
4955 p = stpcpy(p, *a);
4956
4957 }
4958
4959 *p = 0;
4960
4961 /* FIXME: this doesn't really handle arguments that have
4962 * spaces and ticks in them */
4963
4964 return n;
4965 }
4966
4967 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4968 _cleanup_free_ char *cmd = NULL;
4969 const char *prefix2;
4970
4971 assert(c);
4972 assert(f);
4973
4974 prefix = strempty(prefix);
4975 prefix2 = strjoina(prefix, "\t");
4976
4977 cmd = exec_command_line(c->argv);
4978 fprintf(f,
4979 "%sCommand Line: %s\n",
4980 prefix, cmd ? cmd : strerror_safe(ENOMEM));
4981
4982 exec_status_dump(&c->exec_status, f, prefix2);
4983 }
4984
4985 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4986 assert(f);
4987
4988 prefix = strempty(prefix);
4989
4990 LIST_FOREACH(command, c, c)
4991 exec_command_dump(c, f, prefix);
4992 }
4993
4994 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4995 ExecCommand *end;
4996
4997 assert(l);
4998 assert(e);
4999
5000 if (*l) {
5001 /* It's kind of important, that we keep the order here */
5002 LIST_FIND_TAIL(command, *l, end);
5003 LIST_INSERT_AFTER(command, *l, end, e);
5004 } else
5005 *l = e;
5006 }
5007
5008 int exec_command_set(ExecCommand *c, const char *path, ...) {
5009 va_list ap;
5010 char **l, *p;
5011
5012 assert(c);
5013 assert(path);
5014
5015 va_start(ap, path);
5016 l = strv_new_ap(path, ap);
5017 va_end(ap);
5018
5019 if (!l)
5020 return -ENOMEM;
5021
5022 p = strdup(path);
5023 if (!p) {
5024 strv_free(l);
5025 return -ENOMEM;
5026 }
5027
5028 free_and_replace(c->path, p);
5029
5030 return strv_free_and_replace(c->argv, l);
5031 }
5032
5033 int exec_command_append(ExecCommand *c, const char *path, ...) {
5034 _cleanup_strv_free_ char **l = NULL;
5035 va_list ap;
5036 int r;
5037
5038 assert(c);
5039 assert(path);
5040
5041 va_start(ap, path);
5042 l = strv_new_ap(path, ap);
5043 va_end(ap);
5044
5045 if (!l)
5046 return -ENOMEM;
5047
5048 r = strv_extend_strv(&c->argv, l, false);
5049 if (r < 0)
5050 return r;
5051
5052 return 0;
5053 }
5054
5055 static void *remove_tmpdir_thread(void *p) {
5056 _cleanup_free_ char *path = p;
5057
5058 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5059 return NULL;
5060 }
5061
5062 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5063 int r;
5064
5065 if (!rt)
5066 return NULL;
5067
5068 if (rt->manager)
5069 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
5070
5071 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5072 if (destroy && rt->tmp_dir) {
5073 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
5074
5075 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
5076 if (r < 0) {
5077 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
5078 free(rt->tmp_dir);
5079 }
5080
5081 rt->tmp_dir = NULL;
5082 }
5083
5084 if (destroy && rt->var_tmp_dir) {
5085 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
5086
5087 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
5088 if (r < 0) {
5089 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
5090 free(rt->var_tmp_dir);
5091 }
5092
5093 rt->var_tmp_dir = NULL;
5094 }
5095
5096 rt->id = mfree(rt->id);
5097 rt->tmp_dir = mfree(rt->tmp_dir);
5098 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
5099 safe_close_pair(rt->netns_storage_socket);
5100 return mfree(rt);
5101 }
5102
5103 static void exec_runtime_freep(ExecRuntime **rt) {
5104 (void) exec_runtime_free(*rt, false);
5105 }
5106
5107 static int exec_runtime_allocate(ExecRuntime **ret) {
5108 ExecRuntime *n;
5109
5110 assert(ret);
5111
5112 n = new(ExecRuntime, 1);
5113 if (!n)
5114 return -ENOMEM;
5115
5116 *n = (ExecRuntime) {
5117 .netns_storage_socket = { -1, -1 },
5118 };
5119
5120 *ret = n;
5121 return 0;
5122 }
5123
5124 static int exec_runtime_add(
5125 Manager *m,
5126 const char *id,
5127 const char *tmp_dir,
5128 const char *var_tmp_dir,
5129 const int netns_storage_socket[2],
5130 ExecRuntime **ret) {
5131
5132 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
5133 int r;
5134
5135 assert(m);
5136 assert(id);
5137
5138 r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
5139 if (r < 0)
5140 return r;
5141
5142 r = exec_runtime_allocate(&rt);
5143 if (r < 0)
5144 return r;
5145
5146 rt->id = strdup(id);
5147 if (!rt->id)
5148 return -ENOMEM;
5149
5150 if (tmp_dir) {
5151 rt->tmp_dir = strdup(tmp_dir);
5152 if (!rt->tmp_dir)
5153 return -ENOMEM;
5154
5155 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
5156 assert(var_tmp_dir);
5157 rt->var_tmp_dir = strdup(var_tmp_dir);
5158 if (!rt->var_tmp_dir)
5159 return -ENOMEM;
5160 }
5161
5162 if (netns_storage_socket) {
5163 rt->netns_storage_socket[0] = netns_storage_socket[0];
5164 rt->netns_storage_socket[1] = netns_storage_socket[1];
5165 }
5166
5167 r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
5168 if (r < 0)
5169 return r;
5170
5171 rt->manager = m;
5172
5173 if (ret)
5174 *ret = rt;
5175
5176 /* do not remove created ExecRuntime object when the operation succeeds. */
5177 rt = NULL;
5178 return 0;
5179 }
5180
5181 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5182 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
5183 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
5184 int r;
5185
5186 assert(m);
5187 assert(c);
5188 assert(id);
5189
5190 /* It is not necessary to create ExecRuntime object. */
5191 if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
5192 return 0;
5193
5194 if (c->private_tmp) {
5195 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
5196 if (r < 0)
5197 return r;
5198 }
5199
5200 if (c->private_network || c->network_namespace_path) {
5201 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5202 return -errno;
5203 }
5204
5205 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
5206 if (r < 0)
5207 return r;
5208
5209 /* Avoid cleanup */
5210 netns_storage_socket[0] = netns_storage_socket[1] = -1;
5211 return 1;
5212 }
5213
5214 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5215 ExecRuntime *rt;
5216 int r;
5217
5218 assert(m);
5219 assert(id);
5220 assert(ret);
5221
5222 rt = hashmap_get(m->exec_runtime_by_id, id);
5223 if (rt)
5224 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5225 goto ref;
5226
5227 if (!create)
5228 return 0;
5229
5230 /* If not found, then create a new object. */
5231 r = exec_runtime_make(m, c, id, &rt);
5232 if (r <= 0)
5233 /* When r == 0, it is not necessary to create ExecRuntime object. */
5234 return r;
5235
5236 ref:
5237 /* increment reference counter. */
5238 rt->n_ref++;
5239 *ret = rt;
5240 return 1;
5241 }
5242
5243 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5244 if (!rt)
5245 return NULL;
5246
5247 assert(rt->n_ref > 0);
5248
5249 rt->n_ref--;
5250 if (rt->n_ref > 0)
5251 return NULL;
5252
5253 return exec_runtime_free(rt, destroy);
5254 }
5255
5256 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5257 ExecRuntime *rt;
5258 Iterator i;
5259
5260 assert(m);
5261 assert(f);
5262 assert(fds);
5263
5264 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5265 fprintf(f, "exec-runtime=%s", rt->id);
5266
5267 if (rt->tmp_dir)
5268 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
5269
5270 if (rt->var_tmp_dir)
5271 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
5272
5273 if (rt->netns_storage_socket[0] >= 0) {
5274 int copy;
5275
5276 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5277 if (copy < 0)
5278 return copy;
5279
5280 fprintf(f, " netns-socket-0=%i", copy);
5281 }
5282
5283 if (rt->netns_storage_socket[1] >= 0) {
5284 int copy;
5285
5286 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5287 if (copy < 0)
5288 return copy;
5289
5290 fprintf(f, " netns-socket-1=%i", copy);
5291 }
5292
5293 fputc('\n', f);
5294 }
5295
5296 return 0;
5297 }
5298
5299 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5300 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5301 ExecRuntime *rt;
5302 int r;
5303
5304 /* This is for the migration from old (v237 or earlier) deserialization text.
5305 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5306 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5307 * so or not from the serialized text, then we always creates a new object owned by this. */
5308
5309 assert(u);
5310 assert(key);
5311 assert(value);
5312
5313 /* Manager manages ExecRuntime objects by the unit id.
5314 * So, we omit the serialized text when the unit does not have id (yet?)... */
5315 if (isempty(u->id)) {
5316 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5317 return 0;
5318 }
5319
5320 r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5321 if (r < 0) {
5322 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5323 return 0;
5324 }
5325
5326 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5327 if (!rt) {
5328 r = exec_runtime_allocate(&rt_create);
5329 if (r < 0)
5330 return log_oom();
5331
5332 rt_create->id = strdup(u->id);
5333 if (!rt_create->id)
5334 return log_oom();
5335
5336 rt = rt_create;
5337 }
5338
5339 if (streq(key, "tmp-dir")) {
5340 char *copy;
5341
5342 copy = strdup(value);
5343 if (!copy)
5344 return log_oom();
5345
5346 free_and_replace(rt->tmp_dir, copy);
5347
5348 } else if (streq(key, "var-tmp-dir")) {
5349 char *copy;
5350
5351 copy = strdup(value);
5352 if (!copy)
5353 return log_oom();
5354
5355 free_and_replace(rt->var_tmp_dir, copy);
5356
5357 } else if (streq(key, "netns-socket-0")) {
5358 int fd;
5359
5360 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5361 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5362 return 0;
5363 }
5364
5365 safe_close(rt->netns_storage_socket[0]);
5366 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5367
5368 } else if (streq(key, "netns-socket-1")) {
5369 int fd;
5370
5371 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5372 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5373 return 0;
5374 }
5375
5376 safe_close(rt->netns_storage_socket[1]);
5377 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5378 } else
5379 return 0;
5380
5381 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5382 if (rt_create) {
5383 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5384 if (r < 0) {
5385 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5386 return 0;
5387 }
5388
5389 rt_create->manager = u->manager;
5390
5391 /* Avoid cleanup */
5392 rt_create = NULL;
5393 }
5394
5395 return 1;
5396 }
5397
5398 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5399 char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5400 int r, fd0 = -1, fd1 = -1;
5401 const char *p, *v = value;
5402 size_t n;
5403
5404 assert(m);
5405 assert(value);
5406 assert(fds);
5407
5408 n = strcspn(v, " ");
5409 id = strndupa(v, n);
5410 if (v[n] != ' ')
5411 goto finalize;
5412 p = v + n + 1;
5413
5414 v = startswith(p, "tmp-dir=");
5415 if (v) {
5416 n = strcspn(v, " ");
5417 tmp_dir = strndupa(v, n);
5418 if (v[n] != ' ')
5419 goto finalize;
5420 p = v + n + 1;
5421 }
5422
5423 v = startswith(p, "var-tmp-dir=");
5424 if (v) {
5425 n = strcspn(v, " ");
5426 var_tmp_dir = strndupa(v, n);
5427 if (v[n] != ' ')
5428 goto finalize;
5429 p = v + n + 1;
5430 }
5431
5432 v = startswith(p, "netns-socket-0=");
5433 if (v) {
5434 char *buf;
5435
5436 n = strcspn(v, " ");
5437 buf = strndupa(v, n);
5438 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5439 log_debug("Unable to process exec-runtime netns fd specification.");
5440 return;
5441 }
5442 fd0 = fdset_remove(fds, fd0);
5443 if (v[n] != ' ')
5444 goto finalize;
5445 p = v + n + 1;
5446 }
5447
5448 v = startswith(p, "netns-socket-1=");
5449 if (v) {
5450 char *buf;
5451
5452 n = strcspn(v, " ");
5453 buf = strndupa(v, n);
5454 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5455 log_debug("Unable to process exec-runtime netns fd specification.");
5456 return;
5457 }
5458 fd1 = fdset_remove(fds, fd1);
5459 }
5460
5461 finalize:
5462
5463 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5464 if (r < 0)
5465 log_debug_errno(r, "Failed to add exec-runtime: %m");
5466 }
5467
5468 void exec_runtime_vacuum(Manager *m) {
5469 ExecRuntime *rt;
5470 Iterator i;
5471
5472 assert(m);
5473
5474 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5475
5476 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5477 if (rt->n_ref > 0)
5478 continue;
5479
5480 (void) exec_runtime_free(rt, false);
5481 }
5482 }
5483
5484 void exec_params_clear(ExecParameters *p) {
5485 if (!p)
5486 return;
5487
5488 strv_free(p->environment);
5489 }
5490
5491 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5492 [EXEC_INPUT_NULL] = "null",
5493 [EXEC_INPUT_TTY] = "tty",
5494 [EXEC_INPUT_TTY_FORCE] = "tty-force",
5495 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
5496 [EXEC_INPUT_SOCKET] = "socket",
5497 [EXEC_INPUT_NAMED_FD] = "fd",
5498 [EXEC_INPUT_DATA] = "data",
5499 [EXEC_INPUT_FILE] = "file",
5500 };
5501
5502 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5503
5504 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5505 [EXEC_OUTPUT_INHERIT] = "inherit",
5506 [EXEC_OUTPUT_NULL] = "null",
5507 [EXEC_OUTPUT_TTY] = "tty",
5508 [EXEC_OUTPUT_SYSLOG] = "syslog",
5509 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
5510 [EXEC_OUTPUT_KMSG] = "kmsg",
5511 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5512 [EXEC_OUTPUT_JOURNAL] = "journal",
5513 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5514 [EXEC_OUTPUT_SOCKET] = "socket",
5515 [EXEC_OUTPUT_NAMED_FD] = "fd",
5516 [EXEC_OUTPUT_FILE] = "file",
5517 [EXEC_OUTPUT_FILE_APPEND] = "append",
5518 };
5519
5520 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5521
5522 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5523 [EXEC_UTMP_INIT] = "init",
5524 [EXEC_UTMP_LOGIN] = "login",
5525 [EXEC_UTMP_USER] = "user",
5526 };
5527
5528 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5529
5530 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5531 [EXEC_PRESERVE_NO] = "no",
5532 [EXEC_PRESERVE_YES] = "yes",
5533 [EXEC_PRESERVE_RESTART] = "restart",
5534 };
5535
5536 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5537
5538 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
5539 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5540 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5541 [EXEC_DIRECTORY_STATE] = "StateDirectory",
5542 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5543 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5544 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5545 };
5546
5547 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5548
5549 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
5550 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
5551 * directories, specifically .timer units with their timestamp touch file. */
5552 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5553 [EXEC_DIRECTORY_RUNTIME] = "runtime",
5554 [EXEC_DIRECTORY_STATE] = "state",
5555 [EXEC_DIRECTORY_CACHE] = "cache",
5556 [EXEC_DIRECTORY_LOGS] = "logs",
5557 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
5558 };
5559
5560 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
5561
5562 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
5563 * the service payload in. */
5564 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5565 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
5566 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
5567 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
5568 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
5569 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
5570 };
5571
5572 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5573
5574 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5575 [EXEC_KEYRING_INHERIT] = "inherit",
5576 [EXEC_KEYRING_PRIVATE] = "private",
5577 [EXEC_KEYRING_SHARED] = "shared",
5578 };
5579
5580 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);