]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
Merge pull request #8817 from yuwata/cleanup-nsflags
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6 ***/
7
8 #include <errno.h>
9 #include <fcntl.h>
10 #include <glob.h>
11 #include <grp.h>
12 #include <poll.h>
13 #include <signal.h>
14 #include <string.h>
15 #include <sys/capability.h>
16 #include <sys/eventfd.h>
17 #include <sys/mman.h>
18 #include <sys/personality.h>
19 #include <sys/prctl.h>
20 #include <sys/shm.h>
21 #include <sys/socket.h>
22 #include <sys/stat.h>
23 #include <sys/types.h>
24 #include <sys/un.h>
25 #include <unistd.h>
26 #include <utmpx.h>
27
28 #if HAVE_PAM
29 #include <security/pam_appl.h>
30 #endif
31
32 #if HAVE_SELINUX
33 #include <selinux/selinux.h>
34 #endif
35
36 #if HAVE_SECCOMP
37 #include <seccomp.h>
38 #endif
39
40 #if HAVE_APPARMOR
41 #include <sys/apparmor.h>
42 #endif
43
44 #include "sd-messages.h"
45
46 #include "af-list.h"
47 #include "alloc-util.h"
48 #if HAVE_APPARMOR
49 #include "apparmor-util.h"
50 #endif
51 #include "async.h"
52 #include "barrier.h"
53 #include "cap-list.h"
54 #include "capability-util.h"
55 #include "chown-recursive.h"
56 #include "cpu-set-util.h"
57 #include "def.h"
58 #include "env-util.h"
59 #include "errno-list.h"
60 #include "execute.h"
61 #include "exit-status.h"
62 #include "fd-util.h"
63 #include "fileio.h"
64 #include "format-util.h"
65 #include "fs-util.h"
66 #include "glob-util.h"
67 #include "io-util.h"
68 #include "ioprio.h"
69 #include "label.h"
70 #include "log.h"
71 #include "macro.h"
72 #include "manager.h"
73 #include "missing.h"
74 #include "mkdir.h"
75 #include "namespace.h"
76 #include "parse-util.h"
77 #include "path-util.h"
78 #include "process-util.h"
79 #include "rlimit-util.h"
80 #include "rm-rf.h"
81 #if HAVE_SECCOMP
82 #include "seccomp-util.h"
83 #endif
84 #include "securebits.h"
85 #include "securebits-util.h"
86 #include "selinux-util.h"
87 #include "signal-util.h"
88 #include "smack-util.h"
89 #include "socket-util.h"
90 #include "special.h"
91 #include "stat-util.h"
92 #include "string-table.h"
93 #include "string-util.h"
94 #include "strv.h"
95 #include "syslog-util.h"
96 #include "terminal-util.h"
97 #include "unit.h"
98 #include "user-util.h"
99 #include "util.h"
100 #include "utmp-wtmp.h"
101
/* Two timeouts, 5s and 1s respectively, expressed in microseconds.
 * NOTE(review): presumably these bound the wait on the boot-time "idle pipe"; their users are not visible
 * in this part of the file — confirm. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Access mode applied to (and verified on) terminals by chown_terminal(): rw for the owner, w for the
 * group. This assumes there is a 'tty' group */
#define TTY_MODE 0620

/* Send buffer size (8 MiB) requested for the stream socket that connects stdout/stderr to the journal */
#define SNDBUF_SIZE (8*1024*1024)
109
/* Renumbers the passed file descriptors so that fds[i] ends up being fd number i+3, i.e. the array becomes
 * the contiguous range 3, 4, 5, … The array is modified in place: every entry that had to be moved is
 * replaced by its duplicate and the old descriptor is closed.
 *
 * Returns 0 on success, or a negative errno if duplicating a descriptor failed. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        for (;;) {
                int retry_at = -1, idx;

                for (idx = first; idx < (int) n_fds; idx++) {
                        int want = idx + 3, moved;

                        /* Nothing to do if this one already sits in its slot */
                        if (fds[idx] == want)
                                continue;

                        /* F_DUPFD hands out the lowest free fd >= want, which may be larger than want
                         * if the slot is still occupied by one of the later entries. */
                        moved = fcntl(fds[idx], F_DUPFD, want);
                        if (moved < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = moved;

                        /* The slot was taken: remember the first such index, so that another pass
                         * can start there once the later entries have been moved out of the way. */
                        if (retry_at < 0 && moved != want)
                                retry_at = idx;
                }

                if (retry_at < 0)
                        return 0;

                first = retry_at;
        }
}
154
/* Adjusts the file descriptor flags of all passed fds before they are handed to a child: FD_CLOEXEC is
 * cleared on every fd, and O_NONBLOCK is set or cleared (according to 'nonblock') on the first
 * n_socket_fds entries only.
 *
 * Returns 0 on success, or the negative error returned by the first helper call that failed. */
static int flags_fds(const int fds[], size_t n_storage_fds, size_t n_socket_fds, bool nonblock) {
        size_t total = n_storage_fds + n_socket_fds, idx = 0;
        int r;

        if (total == 0)
                return 0;

        assert(fds);

        while (idx < total) {

                /* O_NONBLOCK is only relevant for socket activation, leave the remaining fds alone */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* All of these fds are supposed to be inherited by our children, hence FD_CLOEXEC is
                 * dropped unconditionally. */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;

                idx++;
        }

        return 0;
}
187
188 static const char *exec_context_tty_path(const ExecContext *context) {
189 assert(context);
190
191 if (context->stdio_as_fds)
192 return NULL;
193
194 if (context->tty_path)
195 return context->tty_path;
196
197 return "/dev/console";
198 }
199
200 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
201 const char *path;
202
203 assert(context);
204
205 path = exec_context_tty_path(context);
206
207 if (context->tty_vhangup) {
208 if (p && p->stdin_fd >= 0)
209 (void) terminal_vhangup_fd(p->stdin_fd);
210 else if (path)
211 (void) terminal_vhangup(path);
212 }
213
214 if (context->tty_reset) {
215 if (p && p->stdin_fd >= 0)
216 (void) reset_terminal_fd(p->stdin_fd, true);
217 else if (path)
218 (void) reset_terminal(path);
219 }
220
221 if (context->tty_vt_disallocate && path)
222 (void) vt_disallocate(path);
223 }
224
225 static bool is_terminal_input(ExecInput i) {
226 return IN_SET(i,
227 EXEC_INPUT_TTY,
228 EXEC_INPUT_TTY_FORCE,
229 EXEC_INPUT_TTY_FAIL);
230 }
231
232 static bool is_terminal_output(ExecOutput o) {
233 return IN_SET(o,
234 EXEC_OUTPUT_TTY,
235 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
236 EXEC_OUTPUT_KMSG_AND_CONSOLE,
237 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
238 }
239
240 static bool is_syslog_output(ExecOutput o) {
241 return IN_SET(o,
242 EXEC_OUTPUT_SYSLOG,
243 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
244 }
245
246 static bool is_kmsg_output(ExecOutput o) {
247 return IN_SET(o,
248 EXEC_OUTPUT_KMSG,
249 EXEC_OUTPUT_KMSG_AND_CONSOLE);
250 }
251
252 static bool exec_context_needs_term(const ExecContext *c) {
253 assert(c);
254
255 /* Return true if the execution context suggests we should set $TERM to something useful. */
256
257 if (is_terminal_input(c->std_input))
258 return true;
259
260 if (is_terminal_output(c->std_output))
261 return true;
262
263 if (is_terminal_output(c->std_error))
264 return true;
265
266 return !!c->tty_path;
267 }
268
269 static int open_null_as(int flags, int nfd) {
270 int fd;
271
272 assert(nfd >= 0);
273
274 fd = open("/dev/null", flags|O_NOCTTY);
275 if (fd < 0)
276 return -errno;
277
278 return move_fd(fd, nfd, false);
279 }
280
281 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
282 static const union sockaddr_union sa = {
283 .un.sun_family = AF_UNIX,
284 .un.sun_path = "/run/systemd/journal/stdout",
285 };
286 uid_t olduid = UID_INVALID;
287 gid_t oldgid = GID_INVALID;
288 int r;
289
290 if (gid_is_valid(gid)) {
291 oldgid = getgid();
292
293 if (setegid(gid) < 0)
294 return -errno;
295 }
296
297 if (uid_is_valid(uid)) {
298 olduid = getuid();
299
300 if (seteuid(uid) < 0) {
301 r = -errno;
302 goto restore_gid;
303 }
304 }
305
306 r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
307
308 /* If we fail to restore the uid or gid, things will likely
309 fail later on. This should only happen if an LSM interferes. */
310
311 if (uid_is_valid(uid))
312 (void) seteuid(olduid);
313
314 restore_gid:
315 if (gid_is_valid(gid))
316 (void) setegid(oldgid);
317
318 return r;
319 }
320
321 static int connect_logger_as(
322 const Unit *unit,
323 const ExecContext *context,
324 const ExecParameters *params,
325 ExecOutput output,
326 const char *ident,
327 int nfd,
328 uid_t uid,
329 gid_t gid) {
330
331 int fd, r;
332
333 assert(context);
334 assert(params);
335 assert(output < _EXEC_OUTPUT_MAX);
336 assert(ident);
337 assert(nfd >= 0);
338
339 fd = socket(AF_UNIX, SOCK_STREAM, 0);
340 if (fd < 0)
341 return -errno;
342
343 r = connect_journal_socket(fd, uid, gid);
344 if (r < 0)
345 return r;
346
347 if (shutdown(fd, SHUT_RD) < 0) {
348 safe_close(fd);
349 return -errno;
350 }
351
352 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
353
354 dprintf(fd,
355 "%s\n"
356 "%s\n"
357 "%i\n"
358 "%i\n"
359 "%i\n"
360 "%i\n"
361 "%i\n",
362 context->syslog_identifier ?: ident,
363 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
364 context->syslog_priority,
365 !!context->syslog_level_prefix,
366 is_syslog_output(output),
367 is_kmsg_output(output),
368 is_terminal_output(output));
369
370 return move_fd(fd, nfd, false);
371 }
372 static int open_terminal_as(const char *path, int flags, int nfd) {
373 int fd;
374
375 assert(path);
376 assert(nfd >= 0);
377
378 fd = open_terminal(path, flags | O_NOCTTY);
379 if (fd < 0)
380 return fd;
381
382 return move_fd(fd, nfd, false);
383 }
384
385 static int acquire_path(const char *path, int flags, mode_t mode) {
386 union sockaddr_union sa = {
387 .sa.sa_family = AF_UNIX,
388 };
389 int fd, r;
390
391 assert(path);
392
393 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
394 flags |= O_CREAT;
395
396 fd = open(path, flags|O_NOCTTY, mode);
397 if (fd >= 0)
398 return fd;
399
400 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
401 return -errno;
402 if (strlen(path) > sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
403 return -ENXIO;
404
405 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
406
407 fd = socket(AF_UNIX, SOCK_STREAM, 0);
408 if (fd < 0)
409 return -errno;
410
411 strncpy(sa.un.sun_path, path, sizeof(sa.un.sun_path));
412 if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
413 safe_close(fd);
414 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
415 * indication that his wasn't an AF_UNIX socket after all */
416 }
417
418 if ((flags & O_ACCMODE) == O_RDONLY)
419 r = shutdown(fd, SHUT_WR);
420 else if ((flags & O_ACCMODE) == O_WRONLY)
421 r = shutdown(fd, SHUT_RD);
422 else
423 return fd;
424 if (r < 0) {
425 safe_close(fd);
426 return -errno;
427 }
428
429 return fd;
430 }
431
432 static int fixup_input(
433 const ExecContext *context,
434 int socket_fd,
435 bool apply_tty_stdin) {
436
437 ExecInput std_input;
438
439 assert(context);
440
441 std_input = context->std_input;
442
443 if (is_terminal_input(std_input) && !apply_tty_stdin)
444 return EXEC_INPUT_NULL;
445
446 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
447 return EXEC_INPUT_NULL;
448
449 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
450 return EXEC_INPUT_NULL;
451
452 return std_input;
453 }
454
455 static int fixup_output(ExecOutput std_output, int socket_fd) {
456
457 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
458 return EXEC_OUTPUT_INHERIT;
459
460 return std_output;
461 }
462
463 static int setup_input(
464 const ExecContext *context,
465 const ExecParameters *params,
466 int socket_fd,
467 int named_iofds[3]) {
468
469 ExecInput i;
470
471 assert(context);
472 assert(params);
473
474 if (params->stdin_fd >= 0) {
475 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
476 return -errno;
477
478 /* Try to make this the controlling tty, if it is a tty, and reset it */
479 if (isatty(STDIN_FILENO)) {
480 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
481 (void) reset_terminal_fd(STDIN_FILENO, true);
482 }
483
484 return STDIN_FILENO;
485 }
486
487 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
488
489 switch (i) {
490
491 case EXEC_INPUT_NULL:
492 return open_null_as(O_RDONLY, STDIN_FILENO);
493
494 case EXEC_INPUT_TTY:
495 case EXEC_INPUT_TTY_FORCE:
496 case EXEC_INPUT_TTY_FAIL: {
497 int fd;
498
499 fd = acquire_terminal(exec_context_tty_path(context),
500 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
501 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
502 ACQUIRE_TERMINAL_WAIT,
503 USEC_INFINITY);
504 if (fd < 0)
505 return fd;
506
507 return move_fd(fd, STDIN_FILENO, false);
508 }
509
510 case EXEC_INPUT_SOCKET:
511 assert(socket_fd >= 0);
512
513 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
514
515 case EXEC_INPUT_NAMED_FD:
516 assert(named_iofds[STDIN_FILENO] >= 0);
517
518 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
519 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
520
521 case EXEC_INPUT_DATA: {
522 int fd;
523
524 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
525 if (fd < 0)
526 return fd;
527
528 return move_fd(fd, STDIN_FILENO, false);
529 }
530
531 case EXEC_INPUT_FILE: {
532 bool rw;
533 int fd;
534
535 assert(context->stdio_file[STDIN_FILENO]);
536
537 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
538 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
539
540 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
541 if (fd < 0)
542 return fd;
543
544 return move_fd(fd, STDIN_FILENO, false);
545 }
546
547 default:
548 assert_not_reached("Unknown input type");
549 }
550 }
551
/* Sets up stdout or stderr of the calling process, selected by 'fileno' (STDOUT_FILENO or
 * STDERR_FILENO), according to the execution context.
 *
 * An explicitly passed params->stdout_fd/params->stderr_fd always wins. Otherwise the (fixed up)
 * StandardOutput=/StandardError= type decides what the fd gets connected to. When called for stderr the
 * code expects that stdin and stdout have been set up already, since stderr may be duplicated from them.
 *
 * If the fd gets connected to the journal, the device and inode number of the stream are stored in
 * *journal_stream_dev and *journal_stream_ino (see below for the precise rule).
 *
 * Returns a non-negative value on success, or a negative errno-style error. */
static int setup_output(
                const Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                int fileno,
                int socket_fd,
                int named_iofds[3],
                const char *ident,
                uid_t uid,
                gid_t gid,
                dev_t *journal_stream_dev,
                ino_t *journal_stream_ino) {

        ExecOutput o;
        ExecInput i;
        int r;

        assert(unit);
        assert(context);
        assert(params);
        assert(ident);
        assert(journal_stream_dev);
        assert(journal_stream_ino);

        /* Explicitly passed fds take precedence over everything configured in the context */
        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {

                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
                        return -errno;

                return STDOUT_FILENO;
        }

        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
                        return -errno;

                return STDERR_FILENO;
        }

        /* Determine the effective input and stdout types, after downgrading whatever can't be honoured */
        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
        o = fixup_output(context->std_output, socket_fd);

        if (fileno == STDERR_FILENO) {
                ExecOutput e;
                e = fixup_output(context->std_error, socket_fd);

                /* This expects the input and output are already set up */

                /* Don't change the stderr file descriptor if we inherit all
                 * the way and are not on a tty */
                if (e == EXEC_OUTPUT_INHERIT &&
                    o == EXEC_OUTPUT_INHERIT &&
                    i == EXEC_INPUT_NULL &&
                    !is_terminal_input(context->std_input) &&
                    getppid () != 1)
                        return fileno;

                /* Duplicate from stdout if possible */
                if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
                        return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;

                /* From here on stderr is handled by the generic switch below, using its own type */
                o = e;

        } else if (o == EXEC_OUTPUT_INHERIT) {
                /* If input got downgraded, inherit the original value */
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

                /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
                if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
                if (getppid() != 1)
                        return fileno;

                /* We need to open /dev/null here anew, to get the right access mode. */
                return open_null_as(O_WRONLY, fileno);
        }

        switch (o) {

        case EXEC_OUTPUT_NULL:
                return open_null_as(O_WRONLY, fileno);

        case EXEC_OUTPUT_TTY:
                /* If stdin is a terminal already, simply share it */
                if (is_terminal_input(i))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* We don't reset the terminal if this is just about output */
                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

        case EXEC_OUTPUT_SYSLOG:
        case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
        case EXEC_OUTPUT_KMSG:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
                if (r < 0) {
                        /* Not being able to log is not fatal: warn and fall back to /dev/null */
                        log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
                        r = open_null_as(O_WRONLY, fileno);
                } else {
                        struct stat st;

                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
                         * services to detect whether they are connected to the journal or not.
                         *
                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
                         * about STDERR as that's usually the best way to do logging. */

                        if (fstat(fileno, &st) >= 0 &&
                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
                                *journal_stream_dev = st.st_dev;
                                *journal_stream_ino = st.st_ino;
                        }
                }
                return r;

        case EXEC_OUTPUT_SOCKET:
                assert(socket_fd >= 0);

                return dup2(socket_fd, fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_NAMED_FD:
                assert(named_iofds[fileno] >= 0);

                /* Switch the named fd to blocking mode (best effort) before installing it */
                (void) fd_nonblock(named_iofds[fileno], false);
                return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_FILE: {
                bool rw;
                int fd;

                assert(context->stdio_file[fileno]);

                /* If stdin refers to the very same path, setup_input() opened it read/write, hence just
                 * duplicate stdin instead of opening the path a second time. */
                rw = context->std_input == EXEC_INPUT_FILE &&
                        streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);

                if (rw)
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                fd = acquire_path(context->stdio_file[fileno], O_WRONLY, 0666 & ~context->umask);
                if (fd < 0)
                        return fd;

                return move_fd(fd, fileno, false);
        }

        default:
                assert_not_reached("Unknown error type");
        }
}
706
707 static int chown_terminal(int fd, uid_t uid) {
708 struct stat st;
709
710 assert(fd >= 0);
711
712 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
713 if (isatty(fd) < 1)
714 return 0;
715
716 /* This might fail. What matters are the results. */
717 (void) fchown(fd, uid, -1);
718 (void) fchmod(fd, TTY_MODE);
719
720 if (fstat(fd, &st) < 0)
721 return -errno;
722
723 if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
724 return -EPERM;
725
726 return 0;
727 }
728
729 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
730 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
731 int r;
732
733 assert(_saved_stdin);
734 assert(_saved_stdout);
735
736 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
737 if (saved_stdin < 0)
738 return -errno;
739
740 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
741 if (saved_stdout < 0)
742 return -errno;
743
744 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
745 if (fd < 0)
746 return fd;
747
748 r = chown_terminal(fd, getuid());
749 if (r < 0)
750 return r;
751
752 r = reset_terminal_fd(fd, true);
753 if (r < 0)
754 return r;
755
756 r = rearrange_stdio(fd, fd, STDERR_FILENO);
757 fd = -1;
758 if (r < 0)
759 return r;
760
761 *_saved_stdin = saved_stdin;
762 *_saved_stdout = saved_stdout;
763
764 saved_stdin = saved_stdout = -1;
765
766 return 0;
767 }
768
769 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
770 assert(err < 0);
771
772 if (err == -ETIMEDOUT)
773 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
774 else {
775 errno = -err;
776 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
777 }
778 }
779
780 static void write_confirm_error(int err, const char *vc, const Unit *u) {
781 _cleanup_close_ int fd = -1;
782
783 assert(vc);
784
785 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
786 if (fd < 0)
787 return;
788
789 write_confirm_error_fd(err, fd, u);
790 }
791
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved stdin/stdout back in place (each
 * only if it is a valid fd) and closes the saved copies.
 *
 * Returns 0 on success, or the negative errno of the last dup2() that failed. Both fds are processed
 * and closed even if restoring one of them fails. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int r = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                r = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                r = -errno;

        /* Store what safe_close() returns, so that the caller's variables no longer refer to the
         * closed fds. */
        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return r;
}
813
/* Results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1,  /* don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS,       /* don't execute the command, pretend it succeeded (0) */
        CONFIRM_EXECUTE,               /* execute the command (1) */
};
819
820 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
821 int saved_stdout = -1, saved_stdin = -1, r;
822 _cleanup_free_ char *e = NULL;
823 char c;
824
825 /* For any internal errors, assume a positive response. */
826 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
827 if (r < 0) {
828 write_confirm_error(r, vc, u);
829 return CONFIRM_EXECUTE;
830 }
831
832 /* confirm_spawn might have been disabled while we were sleeping. */
833 if (manager_is_confirm_spawn_disabled(u->manager)) {
834 r = 1;
835 goto restore_stdio;
836 }
837
838 e = ellipsize(cmdline, 60, 100);
839 if (!e) {
840 log_oom();
841 r = CONFIRM_EXECUTE;
842 goto restore_stdio;
843 }
844
845 for (;;) {
846 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
847 if (r < 0) {
848 write_confirm_error_fd(r, STDOUT_FILENO, u);
849 r = CONFIRM_EXECUTE;
850 goto restore_stdio;
851 }
852
853 switch (c) {
854 case 'c':
855 printf("Resuming normal execution.\n");
856 manager_disable_confirm_spawn();
857 r = 1;
858 break;
859 case 'D':
860 unit_dump(u, stdout, " ");
861 continue; /* ask again */
862 case 'f':
863 printf("Failing execution.\n");
864 r = CONFIRM_PRETEND_FAILURE;
865 break;
866 case 'h':
867 printf(" c - continue, proceed without asking anymore\n"
868 " D - dump, show the state of the unit\n"
869 " f - fail, don't execute the command and pretend it failed\n"
870 " h - help\n"
871 " i - info, show a short summary of the unit\n"
872 " j - jobs, show jobs that are in progress\n"
873 " s - skip, don't execute the command and pretend it succeeded\n"
874 " y - yes, execute the command\n");
875 continue; /* ask again */
876 case 'i':
877 printf(" Description: %s\n"
878 " Unit: %s\n"
879 " Command: %s\n",
880 u->id, u->description, cmdline);
881 continue; /* ask again */
882 case 'j':
883 manager_dump_jobs(u->manager, stdout, " ");
884 continue; /* ask again */
885 case 'n':
886 /* 'n' was removed in favor of 'f'. */
887 printf("Didn't understand 'n', did you mean 'f'?\n");
888 continue; /* ask again */
889 case 's':
890 printf("Skipping execution.\n");
891 r = CONFIRM_PRETEND_SUCCESS;
892 break;
893 case 'y':
894 r = CONFIRM_EXECUTE;
895 break;
896 default:
897 assert_not_reached("Unhandled choice");
898 }
899 break;
900 }
901
902 restore_stdio:
903 restore_confirm_stdio(&saved_stdin, &saved_stdout);
904 return r;
905 }
906
907 static int get_fixed_user(const ExecContext *c, const char **user,
908 uid_t *uid, gid_t *gid,
909 const char **home, const char **shell) {
910 int r;
911 const char *name;
912
913 assert(c);
914
915 if (!c->user)
916 return 0;
917
918 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
919 * (i.e. are "/" or "/bin/nologin"). */
920
921 name = c->user;
922 r = get_user_creds_clean(&name, uid, gid, home, shell);
923 if (r < 0)
924 return r;
925
926 *user = name;
927 return 0;
928 }
929
930 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
931 int r;
932 const char *name;
933
934 assert(c);
935
936 if (!c->group)
937 return 0;
938
939 name = c->group;
940 r = get_group_creds(&name, gid);
941 if (r < 0)
942 return r;
943
944 *group = name;
945 return 0;
946 }
947
948 static int get_supplementary_groups(const ExecContext *c, const char *user,
949 const char *group, gid_t gid,
950 gid_t **supplementary_gids, int *ngids) {
951 char **i;
952 int r, k = 0;
953 int ngroups_max;
954 bool keep_groups = false;
955 gid_t *groups = NULL;
956 _cleanup_free_ gid_t *l_gids = NULL;
957
958 assert(c);
959
960 /*
961 * If user is given, then lookup GID and supplementary groups list.
962 * We avoid NSS lookups for gid=0. Also we have to initialize groups
963 * here and as early as possible so we keep the list of supplementary
964 * groups of the caller.
965 */
966 if (user && gid_is_valid(gid) && gid != 0) {
967 /* First step, initialize groups from /etc/groups */
968 if (initgroups(user, gid) < 0)
969 return -errno;
970
971 keep_groups = true;
972 }
973
974 if (strv_isempty(c->supplementary_groups))
975 return 0;
976
977 /*
978 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
979 * be positive, otherwise fail.
980 */
981 errno = 0;
982 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
983 if (ngroups_max <= 0) {
984 if (errno > 0)
985 return -errno;
986 else
987 return -EOPNOTSUPP; /* For all other values */
988 }
989
990 l_gids = new(gid_t, ngroups_max);
991 if (!l_gids)
992 return -ENOMEM;
993
994 if (keep_groups) {
995 /*
996 * Lookup the list of groups that the user belongs to, we
997 * avoid NSS lookups here too for gid=0.
998 */
999 k = ngroups_max;
1000 if (getgrouplist(user, gid, l_gids, &k) < 0)
1001 return -EINVAL;
1002 } else
1003 k = 0;
1004
1005 STRV_FOREACH(i, c->supplementary_groups) {
1006 const char *g;
1007
1008 if (k >= ngroups_max)
1009 return -E2BIG;
1010
1011 g = *i;
1012 r = get_group_creds(&g, l_gids+k);
1013 if (r < 0)
1014 return r;
1015
1016 k++;
1017 }
1018
1019 /*
1020 * Sets ngids to zero to drop all supplementary groups, happens
1021 * when we are under root and SupplementaryGroups= is empty.
1022 */
1023 if (k == 0) {
1024 *ngids = 0;
1025 return 0;
1026 }
1027
1028 /* Otherwise get the final list of supplementary groups */
1029 groups = memdup(l_gids, sizeof(gid_t) * k);
1030 if (!groups)
1031 return -ENOMEM;
1032
1033 *supplementary_gids = groups;
1034 *ngids = k;
1035
1036 groups = NULL;
1037
1038 return 0;
1039 }
1040
/* Applies the group credentials to the calling process: first the supplementary groups (only if the list
 * is not empty), then the real, effective and saved GID (only if 'gid' is valid).
 *
 * Returns 0 on success, or a negative errno-style error. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        int r;

        /* SupplementaryGroups= is only handled if it is not empty */
        if (ngids > 0) {
                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1059
1060 static int enforce_user(const ExecContext *context, uid_t uid) {
1061 assert(context);
1062
1063 if (!uid_is_valid(uid))
1064 return 0;
1065
1066 /* Sets (but doesn't look up) the uid and make sure we keep the
1067 * capabilities while doing so. */
1068
1069 if (context->capability_ambient_set != 0) {
1070
1071 /* First step: If we need to keep capabilities but
1072 * drop privileges we need to make sure we keep our
1073 * caps, while we drop privileges. */
1074 if (uid != 0) {
1075 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1076
1077 if (prctl(PR_GET_SECUREBITS) != sb)
1078 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1079 return -errno;
1080 }
1081 }
1082
1083 /* Second step: actually set the uids */
1084 if (setresuid(uid, uid, uid) < 0)
1085 return -errno;
1086
1087 /* At this point we should have all necessary capabilities but
1088 are otherwise a normal user. However, the caps might got
1089 corrupted due to the setresuid() so we need clean them up
1090 later. This is done outside of this call. */
1091
1092 return 0;
1093 }
1094
#if HAVE_PAM

/* PAM conversation callback that refuses every conversation: we have no means to interact with a user,
 * hence all arguments are ignored and PAM_CONV_ERR is reported back. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        return PAM_CONV_ERR;
}

#endif
1109
/* Opens a PAM session for 'user' using the PAM service 'name', and forks off a helper process named
 * "(sd-pam)" that closes the session again later on.
 *
 * 'tty', if non-NULL, is registered as PAM_TTY. The environment block *env is exported to PAM before the
 * session is opened, and on success replaced by the environment list PAM returns. 'fds'/'n_fds' are the
 * fds that are closed in the helper process, and 'uid'/'gid' the credentials it drops privileges to.
 *
 * Returns a negative errno-style error on failure; every PAM-level error is reported as -EPERM. On
 * success the result of strv_free_and_replace() is returned. If compiled without PAM support this is a
 * NOP that returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Ask the PAM modules to be quiet, unless debug logging is enabled */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                /* Make sure the failure path doesn't attempt to use the handle */
                handle = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export our environment block to PAM */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From now on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        /* Remember our PID, so that the child can detect whether we are still around */
        parent_pid = getpid_cached();

        /* NOTE(review): if safe_fork() fails we jump to "fail" with SIGTERM still blocked, as the old
         * signal mask is only restored on the success path below — confirm whether that is intended. */
        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* Sleep until SIGTERM arrives, retrying if the wait gets interrupted */
                        for (;;) {
                                if (sigwait(&ss, &sig) < 0) {
                                        if (errno == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* The child never returns to the caller, it always exits here */
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment acquired from PAM to the caller, replacing the previous one */
        return strv_free_and_replace(*env, e);

fail:
        /* We get here either with a PAM error in pam_code, or with a negative errno-style error in r */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1315
/* Renames the calling process to "(<name>)", where <name> is the file name component of 'path', cut down to
 * its last 8 characters. If no file name can be derived the process is renamed to "(...)" instead. */
static void rename_process_from_path(const char *path) {
        /* '(' + at most 8 characters + ')' + NUL. The resulting string must fit in 10 chars (i.e. the
         * length of "/sbin/init") to look pretty in /bin/ps */
        char buf[1 + 8 + 1 + 1];
        const char *name;
        size_t n;

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        n = strlen(name);
        if (n > 8) {
                /* Keep the tail rather than the head: the end of the process name is usually more
                 * interesting, since the first bit might just be "systemd-" */
                name += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, name, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1346
1347 static bool context_has_address_families(const ExecContext *c) {
1348 assert(c);
1349
1350 return c->address_families_whitelist ||
1351 !set_isempty(c->address_families);
1352 }
1353
1354 static bool context_has_syscall_filters(const ExecContext *c) {
1355 assert(c);
1356
1357 return c->syscall_whitelist ||
1358 !hashmap_isempty(c->syscall_filter);
1359 }
1360
/* Decides whether the "no new privileges" flag is needed for this context. It is when it was requested
 * explicitly, or when we lack CAP_SYS_ADMIN and any of the settings checked below is enabled. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        /* Explicitly requested */
        if (c->no_new_privileges)
                return true;

        /* If we are privileged, we don't need NNP */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* We need NNP if we have any form of seccomp and are unprivileged */
        if (context_has_address_families(c))
                return true;

        if (c->memory_deny_write_execute || c->restrict_realtime)
                return true;

        if (exec_context_restrict_namespaces_set(c))
                return true;

        if (c->protect_kernel_tunables || c->protect_kernel_modules || c->private_devices)
                return true;

        if (context_has_syscall_filters(c))
                return true;

        if (!set_isempty(c->syscall_archs))
                return true;

        return c->lock_personality;
}
1382
1383 #if HAVE_SECCOMP
1384
1385 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1386
1387 if (is_seccomp_available())
1388 return false;
1389
1390 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1391 return true;
1392 }
1393
1394 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1395 uint32_t negative_action, default_action, action;
1396 int r;
1397
1398 assert(u);
1399 assert(c);
1400
1401 if (!context_has_syscall_filters(c))
1402 return 0;
1403
1404 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1405 return 0;
1406
1407 negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1408
1409 if (c->syscall_whitelist) {
1410 default_action = negative_action;
1411 action = SCMP_ACT_ALLOW;
1412 } else {
1413 default_action = SCMP_ACT_ALLOW;
1414 action = negative_action;
1415 }
1416
1417 if (needs_ambient_hack) {
1418 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1419 if (r < 0)
1420 return r;
1421 }
1422
1423 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1424 }
1425
1426 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1427 assert(u);
1428 assert(c);
1429
1430 if (set_isempty(c->syscall_archs))
1431 return 0;
1432
1433 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1434 return 0;
1435
1436 return seccomp_restrict_archs(c->syscall_archs);
1437 }
1438
1439 static int apply_address_families(const Unit* u, const ExecContext *c) {
1440 assert(u);
1441 assert(c);
1442
1443 if (!context_has_address_families(c))
1444 return 0;
1445
1446 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1447 return 0;
1448
1449 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1450 }
1451
1452 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1453 assert(u);
1454 assert(c);
1455
1456 if (!c->memory_deny_write_execute)
1457 return 0;
1458
1459 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1460 return 0;
1461
1462 return seccomp_memory_deny_write_execute();
1463 }
1464
1465 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1466 assert(u);
1467 assert(c);
1468
1469 if (!c->restrict_realtime)
1470 return 0;
1471
1472 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1473 return 0;
1474
1475 return seccomp_restrict_realtime();
1476 }
1477
1478 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1479 assert(u);
1480 assert(c);
1481
1482 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1483 * let's protect even those systems where this is left on in the kernel. */
1484
1485 if (!c->protect_kernel_tunables)
1486 return 0;
1487
1488 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1489 return 0;
1490
1491 return seccomp_protect_sysctl();
1492 }
1493
1494 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1495 assert(u);
1496 assert(c);
1497
1498 /* Turn off module syscalls on ProtectKernelModules=yes */
1499
1500 if (!c->protect_kernel_modules)
1501 return 0;
1502
1503 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1504 return 0;
1505
1506 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1507 }
1508
1509 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1510 assert(u);
1511 assert(c);
1512
1513 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1514
1515 if (!c->private_devices)
1516 return 0;
1517
1518 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1519 return 0;
1520
1521 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1522 }
1523
1524 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1525 assert(u);
1526 assert(c);
1527
1528 if (!exec_context_restrict_namespaces_set(c))
1529 return 0;
1530
1531 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1532 return 0;
1533
1534 return seccomp_restrict_namespaces(c->restrict_namespaces);
1535 }
1536
1537 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1538 unsigned long personality;
1539 int r;
1540
1541 assert(u);
1542 assert(c);
1543
1544 if (!c->lock_personality)
1545 return 0;
1546
1547 if (skip_seccomp_unavailable(u, "LockPersonality="))
1548 return 0;
1549
1550 personality = c->personality;
1551
1552 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1553 if (personality == PERSONALITY_INVALID) {
1554
1555 r = opinionated_personality(&personality);
1556 if (r < 0)
1557 return r;
1558 }
1559
1560 return seccomp_lock_personality(personality);
1561 }
1562
1563 #endif
1564
1565 static void do_idle_pipe_dance(int idle_pipe[4]) {
1566 assert(idle_pipe);
1567
1568 idle_pipe[1] = safe_close(idle_pipe[1]);
1569 idle_pipe[2] = safe_close(idle_pipe[2]);
1570
1571 if (idle_pipe[0] >= 0) {
1572 int r;
1573
1574 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1575
1576 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1577 ssize_t n;
1578
1579 /* Signal systemd that we are bored and want to continue. */
1580 n = write(idle_pipe[3], "x", 1);
1581 if (n > 0)
1582 /* Wait for systemd to react to the signal above. */
1583 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1584 }
1585
1586 idle_pipe[0] = safe_close(idle_pipe[0]);
1587
1588 }
1589
1590 idle_pipe[3] = safe_close(idle_pipe[3]);
1591 }
1592
1593 static int build_environment(
1594 const Unit *u,
1595 const ExecContext *c,
1596 const ExecParameters *p,
1597 size_t n_fds,
1598 const char *home,
1599 const char *username,
1600 const char *shell,
1601 dev_t journal_stream_dev,
1602 ino_t journal_stream_ino,
1603 char ***ret) {
1604
1605 _cleanup_strv_free_ char **our_env = NULL;
1606 size_t n_env = 0;
1607 char *x;
1608
1609 assert(u);
1610 assert(c);
1611 assert(ret);
1612
1613 our_env = new0(char*, 14);
1614 if (!our_env)
1615 return -ENOMEM;
1616
1617 if (n_fds > 0) {
1618 _cleanup_free_ char *joined = NULL;
1619
1620 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1621 return -ENOMEM;
1622 our_env[n_env++] = x;
1623
1624 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1625 return -ENOMEM;
1626 our_env[n_env++] = x;
1627
1628 joined = strv_join(p->fd_names, ":");
1629 if (!joined)
1630 return -ENOMEM;
1631
1632 x = strjoin("LISTEN_FDNAMES=", joined);
1633 if (!x)
1634 return -ENOMEM;
1635 our_env[n_env++] = x;
1636 }
1637
1638 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1639 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1640 return -ENOMEM;
1641 our_env[n_env++] = x;
1642
1643 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1644 return -ENOMEM;
1645 our_env[n_env++] = x;
1646 }
1647
1648 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1649 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1650 * check the database directly. */
1651 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1652 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1653 if (!x)
1654 return -ENOMEM;
1655 our_env[n_env++] = x;
1656 }
1657
1658 if (home) {
1659 x = strappend("HOME=", home);
1660 if (!x)
1661 return -ENOMEM;
1662 our_env[n_env++] = x;
1663 }
1664
1665 if (username) {
1666 x = strappend("LOGNAME=", username);
1667 if (!x)
1668 return -ENOMEM;
1669 our_env[n_env++] = x;
1670
1671 x = strappend("USER=", username);
1672 if (!x)
1673 return -ENOMEM;
1674 our_env[n_env++] = x;
1675 }
1676
1677 if (shell) {
1678 x = strappend("SHELL=", shell);
1679 if (!x)
1680 return -ENOMEM;
1681 our_env[n_env++] = x;
1682 }
1683
1684 if (!sd_id128_is_null(u->invocation_id)) {
1685 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1686 return -ENOMEM;
1687
1688 our_env[n_env++] = x;
1689 }
1690
1691 if (exec_context_needs_term(c)) {
1692 const char *tty_path, *term = NULL;
1693
1694 tty_path = exec_context_tty_path(c);
1695
1696 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1697 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1698 * passes to PID 1 ends up all the way in the console login shown. */
1699
1700 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1701 term = getenv("TERM");
1702 if (!term)
1703 term = default_term_for_tty(tty_path);
1704
1705 x = strappend("TERM=", term);
1706 if (!x)
1707 return -ENOMEM;
1708 our_env[n_env++] = x;
1709 }
1710
1711 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1712 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1713 return -ENOMEM;
1714
1715 our_env[n_env++] = x;
1716 }
1717
1718 our_env[n_env++] = NULL;
1719 assert(n_env <= 12);
1720
1721 *ret = TAKE_PTR(our_env);
1722
1723 return 0;
1724 }
1725
1726 static int build_pass_environment(const ExecContext *c, char ***ret) {
1727 _cleanup_strv_free_ char **pass_env = NULL;
1728 size_t n_env = 0, n_bufsize = 0;
1729 char **i;
1730
1731 STRV_FOREACH(i, c->pass_environment) {
1732 _cleanup_free_ char *x = NULL;
1733 char *v;
1734
1735 v = getenv(*i);
1736 if (!v)
1737 continue;
1738 x = strjoin(*i, "=", v);
1739 if (!x)
1740 return -ENOMEM;
1741
1742 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1743 return -ENOMEM;
1744
1745 pass_env[n_env++] = TAKE_PTR(x);
1746 pass_env[n_env] = NULL;
1747 }
1748
1749 *ret = TAKE_PTR(pass_env);
1750
1751 return 0;
1752 }
1753
1754 static bool exec_needs_mount_namespace(
1755 const ExecContext *context,
1756 const ExecParameters *params,
1757 const ExecRuntime *runtime) {
1758
1759 assert(context);
1760 assert(params);
1761
1762 if (context->root_image)
1763 return true;
1764
1765 if (!strv_isempty(context->read_write_paths) ||
1766 !strv_isempty(context->read_only_paths) ||
1767 !strv_isempty(context->inaccessible_paths))
1768 return true;
1769
1770 if (context->n_bind_mounts > 0)
1771 return true;
1772
1773 if (context->n_temporary_filesystems > 0)
1774 return true;
1775
1776 if (context->mount_flags != 0)
1777 return true;
1778
1779 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1780 return true;
1781
1782 if (context->private_devices ||
1783 context->protect_system != PROTECT_SYSTEM_NO ||
1784 context->protect_home != PROTECT_HOME_NO ||
1785 context->protect_kernel_tunables ||
1786 context->protect_kernel_modules ||
1787 context->protect_control_groups)
1788 return true;
1789
1790 if (context->mount_apivfs && (context->root_image || context->root_directory))
1791 return true;
1792
1793 if (context->dynamic_user &&
1794 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1795 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1796 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1797 return true;
1798
1799 return false;
1800 }
1801
1802 static int setup_private_users(uid_t uid, gid_t gid) {
1803 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1804 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1805 _cleanup_close_ int unshare_ready_fd = -1;
1806 _cleanup_(sigkill_waitp) pid_t pid = 0;
1807 uint64_t c = 1;
1808 ssize_t n;
1809 int r;
1810
1811 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1812 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1813 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1814 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1815 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1816 * continues execution normally. */
1817
1818 if (uid != 0 && uid_is_valid(uid)) {
1819 r = asprintf(&uid_map,
1820 "0 0 1\n" /* Map root → root */
1821 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
1822 uid, uid);
1823 if (r < 0)
1824 return -ENOMEM;
1825 } else {
1826 uid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1827 if (!uid_map)
1828 return -ENOMEM;
1829 }
1830
1831 if (gid != 0 && gid_is_valid(gid)) {
1832 r = asprintf(&gid_map,
1833 "0 0 1\n" /* Map root → root */
1834 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
1835 gid, gid);
1836 if (r < 0)
1837 return -ENOMEM;
1838 } else {
1839 gid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1840 if (!gid_map)
1841 return -ENOMEM;
1842 }
1843
1844 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1845 * namespace. */
1846 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1847 if (unshare_ready_fd < 0)
1848 return -errno;
1849
1850 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1851 * failed. */
1852 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1853 return -errno;
1854
1855 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1856 if (r < 0)
1857 return r;
1858 if (r == 0) {
1859 _cleanup_close_ int fd = -1;
1860 const char *a;
1861 pid_t ppid;
1862
1863 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1864 * here, after the parent opened its own user namespace. */
1865
1866 ppid = getppid();
1867 errno_pipe[0] = safe_close(errno_pipe[0]);
1868
1869 /* Wait until the parent unshared the user namespace */
1870 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1871 r = -errno;
1872 goto child_fail;
1873 }
1874
1875 /* Disable the setgroups() system call in the child user namespace, for good. */
1876 a = procfs_file_alloca(ppid, "setgroups");
1877 fd = open(a, O_WRONLY|O_CLOEXEC);
1878 if (fd < 0) {
1879 if (errno != ENOENT) {
1880 r = -errno;
1881 goto child_fail;
1882 }
1883
1884 /* If the file is missing the kernel is too old, let's continue anyway. */
1885 } else {
1886 if (write(fd, "deny\n", 5) < 0) {
1887 r = -errno;
1888 goto child_fail;
1889 }
1890
1891 fd = safe_close(fd);
1892 }
1893
1894 /* First write the GID map */
1895 a = procfs_file_alloca(ppid, "gid_map");
1896 fd = open(a, O_WRONLY|O_CLOEXEC);
1897 if (fd < 0) {
1898 r = -errno;
1899 goto child_fail;
1900 }
1901 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1902 r = -errno;
1903 goto child_fail;
1904 }
1905 fd = safe_close(fd);
1906
1907 /* The write the UID map */
1908 a = procfs_file_alloca(ppid, "uid_map");
1909 fd = open(a, O_WRONLY|O_CLOEXEC);
1910 if (fd < 0) {
1911 r = -errno;
1912 goto child_fail;
1913 }
1914 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1915 r = -errno;
1916 goto child_fail;
1917 }
1918
1919 _exit(EXIT_SUCCESS);
1920
1921 child_fail:
1922 (void) write(errno_pipe[1], &r, sizeof(r));
1923 _exit(EXIT_FAILURE);
1924 }
1925
1926 errno_pipe[1] = safe_close(errno_pipe[1]);
1927
1928 if (unshare(CLONE_NEWUSER) < 0)
1929 return -errno;
1930
1931 /* Let the child know that the namespace is ready now */
1932 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1933 return -errno;
1934
1935 /* Try to read an error code from the child */
1936 n = read(errno_pipe[0], &r, sizeof(r));
1937 if (n < 0)
1938 return -errno;
1939 if (n == sizeof(r)) { /* an error code was sent to us */
1940 if (r < 0)
1941 return r;
1942 return -EIO;
1943 }
1944 if (n != 0) /* on success we should have read 0 bytes */
1945 return -EIO;
1946
1947 r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
1948 pid = 0;
1949 if (r < 0)
1950 return r;
1951 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
1952 return -EIO;
1953
1954 return 0;
1955 }
1956
1957 static int setup_exec_directory(
1958 const ExecContext *context,
1959 const ExecParameters *params,
1960 uid_t uid,
1961 gid_t gid,
1962 ExecDirectoryType type,
1963 int *exit_status) {
1964
1965 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1966 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1967 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1968 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1969 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
1970 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
1971 };
1972 char **rt;
1973 int r;
1974
1975 assert(context);
1976 assert(params);
1977 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
1978 assert(exit_status);
1979
1980 if (!params->prefix[type])
1981 return 0;
1982
1983 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
1984 if (!uid_is_valid(uid))
1985 uid = 0;
1986 if (!gid_is_valid(gid))
1987 gid = 0;
1988 }
1989
1990 STRV_FOREACH(rt, context->directories[type].paths) {
1991 _cleanup_free_ char *p = NULL, *pp = NULL;
1992
1993 p = strjoin(params->prefix[type], "/", *rt);
1994 if (!p) {
1995 r = -ENOMEM;
1996 goto fail;
1997 }
1998
1999 r = mkdir_parents_label(p, 0755);
2000 if (r < 0)
2001 goto fail;
2002
2003 if (context->dynamic_user &&
2004 !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2005 _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
2006
2007 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2008 * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2009 * whose UID is later on reused. To lock this down we use the same trick used by container
2010 * managers to prohibit host users to get access to files of the same UID in containers: we
2011 * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2012 * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2013 * to make this directory permeable for the service itself.
2014 *
2015 * Specifically: for a service which wants a special directory "foo/" we first create a
2016 * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2017 * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2018 * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2019 * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
2020 * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2021 * disabling the access boundary for the service and making sure it only gets access to the
2022 * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2023 *
2024 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2025 * owned by the service itself.
2026 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2027 * files or sockets with other services. */
2028
2029 private_root = strjoin(params->prefix[type], "/private");
2030 if (!private_root) {
2031 r = -ENOMEM;
2032 goto fail;
2033 }
2034
2035 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2036 r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2037 if (r < 0)
2038 goto fail;
2039
2040 pp = strjoin(private_root, "/", *rt);
2041 if (!pp) {
2042 r = -ENOMEM;
2043 goto fail;
2044 }
2045
2046 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2047 r = mkdir_parents_label(pp, 0755);
2048 if (r < 0)
2049 goto fail;
2050
2051 if (is_dir(p, false) > 0 &&
2052 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2053
2054 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2055 * it over. Most likely the service has been upgraded from one that didn't use
2056 * DynamicUser=1, to one that does. */
2057
2058 if (rename(p, pp) < 0) {
2059 r = -errno;
2060 goto fail;
2061 }
2062 } else {
2063 /* Otherwise, create the actual directory for the service */
2064
2065 r = mkdir_label(pp, context->directories[type].mode);
2066 if (r < 0 && r != -EEXIST)
2067 goto fail;
2068 }
2069
2070 parent = dirname_malloc(p);
2071 if (!parent) {
2072 r = -ENOMEM;
2073 goto fail;
2074 }
2075
2076 r = path_make_relative(parent, pp, &relative);
2077 if (r < 0)
2078 goto fail;
2079
2080 /* And link it up from the original place */
2081 r = symlink_idempotent(relative, p);
2082 if (r < 0)
2083 goto fail;
2084
2085 /* Lock down the access mode */
2086 if (chmod(pp, context->directories[type].mode) < 0) {
2087 r = -errno;
2088 goto fail;
2089 }
2090 } else {
2091 r = mkdir_label(p, context->directories[type].mode);
2092 if (r < 0 && r != -EEXIST)
2093 goto fail;
2094 if (r == -EEXIST && !context->dynamic_user)
2095 continue;
2096 }
2097
2098 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2099 * a service, and shall not be writable. */
2100 if (type == EXEC_DIRECTORY_CONFIGURATION)
2101 continue;
2102
2103 /* Then, change the ownership of the whole tree, if necessary */
2104 r = path_chown_recursive(pp ?: p, uid, gid);
2105 if (r < 0)
2106 goto fail;
2107 }
2108
2109 return 0;
2110
2111 fail:
2112 *exit_status = exit_status_table[type];
2113 return r;
2114 }
2115
#if ENABLE_SMACK
/* Applies the SMACK label for the process about to be executed. If the context configures a process label
 * it is applied. Otherwise, and only when SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the exec
 * label read from the command's binary is applied, falling back to the default label if none was read.
 *
 * Returns 0 on success or a negative errno-style error. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r;

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;

                return 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;

                /* A missing or unsupported attribute is fine, we then fall back to the default label */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2148
2149 static int compile_bind_mounts(
2150 const ExecContext *context,
2151 const ExecParameters *params,
2152 BindMount **ret_bind_mounts,
2153 size_t *ret_n_bind_mounts,
2154 char ***ret_empty_directories) {
2155
2156 _cleanup_strv_free_ char **empty_directories = NULL;
2157 BindMount *bind_mounts;
2158 size_t n, h = 0, i;
2159 ExecDirectoryType t;
2160 int r;
2161
2162 assert(context);
2163 assert(params);
2164 assert(ret_bind_mounts);
2165 assert(ret_n_bind_mounts);
2166 assert(ret_empty_directories);
2167
2168 n = context->n_bind_mounts;
2169 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2170 if (!params->prefix[t])
2171 continue;
2172
2173 n += strv_length(context->directories[t].paths);
2174 }
2175
2176 if (n <= 0) {
2177 *ret_bind_mounts = NULL;
2178 *ret_n_bind_mounts = 0;
2179 *ret_empty_directories = NULL;
2180 return 0;
2181 }
2182
2183 bind_mounts = new(BindMount, n);
2184 if (!bind_mounts)
2185 return -ENOMEM;
2186
2187 for (i = 0; i < context->n_bind_mounts; i++) {
2188 BindMount *item = context->bind_mounts + i;
2189 char *s, *d;
2190
2191 s = strdup(item->source);
2192 if (!s) {
2193 r = -ENOMEM;
2194 goto finish;
2195 }
2196
2197 d = strdup(item->destination);
2198 if (!d) {
2199 free(s);
2200 r = -ENOMEM;
2201 goto finish;
2202 }
2203
2204 bind_mounts[h++] = (BindMount) {
2205 .source = s,
2206 .destination = d,
2207 .read_only = item->read_only,
2208 .recursive = item->recursive,
2209 .ignore_enoent = item->ignore_enoent,
2210 };
2211 }
2212
2213 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2214 char **suffix;
2215
2216 if (!params->prefix[t])
2217 continue;
2218
2219 if (strv_isempty(context->directories[t].paths))
2220 continue;
2221
2222 if (context->dynamic_user &&
2223 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2224 char *private_root;
2225
2226 /* So this is for a dynamic user, and we need to make sure the process can access its own
2227 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2228 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2229
2230 private_root = strjoin(params->prefix[t], "/private");
2231 if (!private_root) {
2232 r = -ENOMEM;
2233 goto finish;
2234 }
2235
2236 r = strv_consume(&empty_directories, private_root);
2237 if (r < 0)
2238 goto finish;
2239 }
2240
2241 STRV_FOREACH(suffix, context->directories[t].paths) {
2242 char *s, *d;
2243
2244 if (context->dynamic_user &&
2245 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2246 s = strjoin(params->prefix[t], "/private/", *suffix);
2247 else
2248 s = strjoin(params->prefix[t], "/", *suffix);
2249 if (!s) {
2250 r = -ENOMEM;
2251 goto finish;
2252 }
2253
2254 d = strdup(s);
2255 if (!d) {
2256 free(s);
2257 r = -ENOMEM;
2258 goto finish;
2259 }
2260
2261 bind_mounts[h++] = (BindMount) {
2262 .source = s,
2263 .destination = d,
2264 .read_only = false,
2265 .recursive = true,
2266 .ignore_enoent = false,
2267 };
2268 }
2269 }
2270
2271 assert(h == n);
2272
2273 *ret_bind_mounts = bind_mounts;
2274 *ret_n_bind_mounts = n;
2275 *ret_empty_directories = TAKE_PTR(empty_directories);
2276
2277 return (int) n;
2278
2279 finish:
2280 bind_mount_free_many(bind_mounts, h);
2281 return r;
2282 }
2283
2284 static int apply_mount_namespace(
2285 const Unit *u,
2286 const ExecCommand *command,
2287 const ExecContext *context,
2288 const ExecParameters *params,
2289 const ExecRuntime *runtime) {
2290
2291 _cleanup_strv_free_ char **empty_directories = NULL;
2292 char *tmp = NULL, *var = NULL;
2293 const char *root_dir = NULL, *root_image = NULL;
2294 NamespaceInfo ns_info = {};
2295 bool needs_sandboxing;
2296 BindMount *bind_mounts = NULL;
2297 size_t n_bind_mounts = 0;
2298 int r;
2299
2300 assert(context);
2301
2302 /* The runtime struct only contains the parent of the private /tmp,
2303 * which is non-accessible to world users. Inside of it there's a /tmp
2304 * that is sticky, and that's the one we want to use here. */
2305
2306 if (context->private_tmp && runtime) {
2307 if (runtime->tmp_dir)
2308 tmp = strjoina(runtime->tmp_dir, "/tmp");
2309 if (runtime->var_tmp_dir)
2310 var = strjoina(runtime->var_tmp_dir, "/tmp");
2311 }
2312
2313 if (params->flags & EXEC_APPLY_CHROOT) {
2314 root_image = context->root_image;
2315
2316 if (!root_image)
2317 root_dir = context->root_directory;
2318 }
2319
2320 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2321 if (r < 0)
2322 return r;
2323
2324 /*
2325 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2326 * sandbox info, otherwise enforce it, don't ignore protected paths and
2327 * fail if we are enable to apply the sandbox inside the mount namespace.
2328 */
2329 if (!context->dynamic_user && root_dir)
2330 ns_info.ignore_protect_paths = true;
2331
2332 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2333
2334 if (needs_sandboxing)
2335 ns_info = (NamespaceInfo) {
2336 .ignore_protect_paths = false,
2337 .private_dev = context->private_devices,
2338 .protect_control_groups = context->protect_control_groups,
2339 .protect_kernel_tunables = context->protect_kernel_tunables,
2340 .protect_kernel_modules = context->protect_kernel_modules,
2341 .mount_apivfs = context->mount_apivfs,
2342 };
2343
2344 r = setup_namespace(root_dir, root_image,
2345 &ns_info, context->read_write_paths,
2346 needs_sandboxing ? context->read_only_paths : NULL,
2347 needs_sandboxing ? context->inaccessible_paths : NULL,
2348 empty_directories,
2349 bind_mounts,
2350 n_bind_mounts,
2351 context->temporary_filesystems,
2352 context->n_temporary_filesystems,
2353 tmp,
2354 var,
2355 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2356 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2357 context->mount_flags,
2358 DISSECT_IMAGE_DISCARD_ON_LOOP);
2359
2360 bind_mount_free_many(bind_mounts, n_bind_mounts);
2361
2362 /* If we couldn't set up the namespace this is probably due to a
2363 * missing capability. In this case, silently proceeed. */
2364 if (IN_SET(r, -EPERM, -EACCES)) {
2365 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2366 return 0;
2367 }
2368
2369 return r;
2370 }
2371
2372 static int apply_working_directory(
2373 const ExecContext *context,
2374 const ExecParameters *params,
2375 const char *home,
2376 const bool needs_mount_ns,
2377 int *exit_status) {
2378
2379 const char *d, *wd;
2380
2381 assert(context);
2382 assert(exit_status);
2383
2384 if (context->working_directory_home) {
2385
2386 if (!home) {
2387 *exit_status = EXIT_CHDIR;
2388 return -ENXIO;
2389 }
2390
2391 wd = home;
2392
2393 } else if (context->working_directory)
2394 wd = context->working_directory;
2395 else
2396 wd = "/";
2397
2398 if (params->flags & EXEC_APPLY_CHROOT) {
2399 if (!needs_mount_ns && context->root_directory)
2400 if (chroot(context->root_directory) < 0) {
2401 *exit_status = EXIT_CHROOT;
2402 return -errno;
2403 }
2404
2405 d = wd;
2406 } else
2407 d = prefix_roota(context->root_directory, wd);
2408
2409 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2410 *exit_status = EXIT_CHDIR;
2411 return -errno;
2412 }
2413
2414 return 0;
2415 }
2416
2417 static int setup_keyring(
2418 const Unit *u,
2419 const ExecContext *context,
2420 const ExecParameters *p,
2421 uid_t uid, gid_t gid) {
2422
2423 key_serial_t keyring;
2424 int r = 0;
2425 uid_t saved_uid;
2426 gid_t saved_gid;
2427
2428 assert(u);
2429 assert(context);
2430 assert(p);
2431
2432 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2433 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2434 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2435 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2436 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2437 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2438
2439 if (!(p->flags & EXEC_NEW_KEYRING))
2440 return 0;
2441
2442 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2443 return 0;
2444
2445 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2446 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2447 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2448 * & group is just as nasty as acquiring a reference to the user keyring. */
2449
2450 saved_uid = getuid();
2451 saved_gid = getgid();
2452
2453 if (gid_is_valid(gid) && gid != saved_gid) {
2454 if (setregid(gid, -1) < 0)
2455 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2456 }
2457
2458 if (uid_is_valid(uid) && uid != saved_uid) {
2459 if (setreuid(uid, -1) < 0) {
2460 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2461 goto out;
2462 }
2463 }
2464
2465 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2466 if (keyring == -1) {
2467 if (errno == ENOSYS)
2468 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2469 else if (IN_SET(errno, EACCES, EPERM))
2470 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2471 else if (errno == EDQUOT)
2472 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2473 else
2474 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2475
2476 goto out;
2477 }
2478
2479 /* When requested link the user keyring into the session keyring. */
2480 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2481
2482 if (keyctl(KEYCTL_LINK,
2483 KEY_SPEC_USER_KEYRING,
2484 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2485 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2486 goto out;
2487 }
2488 }
2489
2490 /* Restore uid/gid back */
2491 if (uid_is_valid(uid) && uid != saved_uid) {
2492 if (setreuid(saved_uid, -1) < 0) {
2493 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2494 goto out;
2495 }
2496 }
2497
2498 if (gid_is_valid(gid) && gid != saved_gid) {
2499 if (setregid(saved_gid, -1) < 0)
2500 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2501 }
2502
2503 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2504 if (!sd_id128_is_null(u->invocation_id)) {
2505 key_serial_t key;
2506
2507 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2508 if (key == -1)
2509 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2510 else {
2511 if (keyctl(KEYCTL_SETPERM, key,
2512 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2513 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2514 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2515 }
2516 }
2517
2518 out:
2519 /* Revert back uid & gid for the the last time, and exit */
2520 /* no extra logging, as only the first already reported error matters */
2521 if (getuid() != saved_uid)
2522 (void) setreuid(saved_uid, -1);
2523
2524 if (getgid() != saved_gid)
2525 (void) setregid(saved_gid, -1);
2526
2527 return r;
2528 }
2529
/* Appends the valid (i.e. non-negative) file descriptors of "pair" to "array", in order, and advances the fill
 * counter *n by the number of entries added. A NULL "pair" is a no-op. The caller has to make sure that
 * "array" has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[2]) {
        size_t slot;

        assert(array);
        assert(n);

        if (!pair)
                return;

        for (slot = 0; slot < 2; slot++) {
                if (pair[slot] < 0)
                        continue;

                array[*n] = pair[slot];
                (*n)++;
        }
}
2542
2543 static int close_remaining_fds(
2544 const ExecParameters *params,
2545 const ExecRuntime *runtime,
2546 const DynamicCreds *dcreds,
2547 int user_lookup_fd,
2548 int socket_fd,
2549 int *fds, size_t n_fds) {
2550
2551 size_t n_dont_close = 0;
2552 int dont_close[n_fds + 12];
2553
2554 assert(params);
2555
2556 if (params->stdin_fd >= 0)
2557 dont_close[n_dont_close++] = params->stdin_fd;
2558 if (params->stdout_fd >= 0)
2559 dont_close[n_dont_close++] = params->stdout_fd;
2560 if (params->stderr_fd >= 0)
2561 dont_close[n_dont_close++] = params->stderr_fd;
2562
2563 if (socket_fd >= 0)
2564 dont_close[n_dont_close++] = socket_fd;
2565 if (n_fds > 0) {
2566 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2567 n_dont_close += n_fds;
2568 }
2569
2570 if (runtime)
2571 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2572
2573 if (dcreds) {
2574 if (dcreds->user)
2575 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2576 if (dcreds->group)
2577 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2578 }
2579
2580 if (user_lookup_fd >= 0)
2581 dont_close[n_dont_close++] = user_lookup_fd;
2582
2583 return close_all_fds(dont_close, n_dont_close);
2584 }
2585
2586 static int send_user_lookup(
2587 Unit *unit,
2588 int user_lookup_fd,
2589 uid_t uid,
2590 gid_t gid) {
2591
2592 assert(unit);
2593
2594 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2595 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2596 * specified. */
2597
2598 if (user_lookup_fd < 0)
2599 return 0;
2600
2601 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2602 return 0;
2603
2604 if (writev(user_lookup_fd,
2605 (struct iovec[]) {
2606 IOVEC_INIT(&uid, sizeof(uid)),
2607 IOVEC_INIT(&gid, sizeof(gid)),
2608 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2609 return -errno;
2610
2611 return 0;
2612 }
2613
2614 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2615 int r;
2616
2617 assert(c);
2618 assert(home);
2619 assert(buf);
2620
2621 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2622
2623 if (*home)
2624 return 0;
2625
2626 if (!c->working_directory_home)
2627 return 0;
2628
2629 if (uid == 0) {
2630 /* Hardcode /root as home directory for UID 0 */
2631 *home = "/root";
2632 return 1;
2633 }
2634
2635 r = get_home_dir(buf);
2636 if (r < 0)
2637 return r;
2638
2639 *home = *buf;
2640 return 1;
2641 }
2642
2643 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2644 _cleanup_strv_free_ char ** list = NULL;
2645 ExecDirectoryType t;
2646 int r;
2647
2648 assert(c);
2649 assert(p);
2650 assert(ret);
2651
2652 assert(c->dynamic_user);
2653
2654 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2655 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2656 * directories. */
2657
2658 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2659 char **i;
2660
2661 if (t == EXEC_DIRECTORY_CONFIGURATION)
2662 continue;
2663
2664 if (!p->prefix[t])
2665 continue;
2666
2667 STRV_FOREACH(i, c->directories[t].paths) {
2668 char *e;
2669
2670 if (t == EXEC_DIRECTORY_RUNTIME)
2671 e = strjoin(p->prefix[t], "/", *i);
2672 else
2673 e = strjoin(p->prefix[t], "/private/", *i);
2674 if (!e)
2675 return -ENOMEM;
2676
2677 r = strv_consume(&list, e);
2678 if (r < 0)
2679 return r;
2680 }
2681 }
2682
2683 *ret = TAKE_PTR(list);
2684
2685 return 0;
2686 }
2687
2688 static char *exec_command_line(char **argv);
2689
2690 static int exec_child(
2691 Unit *unit,
2692 const ExecCommand *command,
2693 const ExecContext *context,
2694 const ExecParameters *params,
2695 ExecRuntime *runtime,
2696 DynamicCreds *dcreds,
2697 char **argv,
2698 int socket_fd,
2699 int named_iofds[3],
2700 int *fds,
2701 size_t n_storage_fds,
2702 size_t n_socket_fds,
2703 char **files_env,
2704 int user_lookup_fd,
2705 int *exit_status) {
2706
2707 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2708 _cleanup_free_ char *home_buffer = NULL;
2709 _cleanup_free_ gid_t *supplementary_gids = NULL;
2710 const char *username = NULL, *groupname = NULL;
2711 const char *home = NULL, *shell = NULL;
2712 dev_t journal_stream_dev = 0;
2713 ino_t journal_stream_ino = 0;
2714 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2715 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
2716 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
2717 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
2718 #if HAVE_SELINUX
2719 _cleanup_free_ char *mac_selinux_context_net = NULL;
2720 bool use_selinux = false;
2721 #endif
2722 #if ENABLE_SMACK
2723 bool use_smack = false;
2724 #endif
2725 #if HAVE_APPARMOR
2726 bool use_apparmor = false;
2727 #endif
2728 uid_t uid = UID_INVALID;
2729 gid_t gid = GID_INVALID;
2730 int r, ngids = 0;
2731 size_t n_fds;
2732 ExecDirectoryType dt;
2733 int secure_bits;
2734
2735 assert(unit);
2736 assert(command);
2737 assert(context);
2738 assert(params);
2739 assert(exit_status);
2740
2741 rename_process_from_path(command->path);
2742
2743 /* We reset exactly these signals, since they are the
2744 * only ones we set to SIG_IGN in the main daemon. All
2745 * others we leave untouched because we set them to
2746 * SIG_DFL or a valid handler initially, both of which
2747 * will be demoted to SIG_DFL. */
2748 (void) default_signals(SIGNALS_CRASH_HANDLER,
2749 SIGNALS_IGNORE, -1);
2750
2751 if (context->ignore_sigpipe)
2752 (void) ignore_signals(SIGPIPE, -1);
2753
2754 r = reset_signal_mask();
2755 if (r < 0) {
2756 *exit_status = EXIT_SIGNAL_MASK;
2757 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2758 }
2759
2760 if (params->idle_pipe)
2761 do_idle_pipe_dance(params->idle_pipe);
2762
2763 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2764 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2765 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2766 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2767
2768 log_forget_fds();
2769 log_set_open_when_needed(true);
2770
2771 /* In case anything used libc syslog(), close this here, too */
2772 closelog();
2773
2774 n_fds = n_storage_fds + n_socket_fds;
2775 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
2776 if (r < 0) {
2777 *exit_status = EXIT_FDS;
2778 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2779 }
2780
2781 if (!context->same_pgrp)
2782 if (setsid() < 0) {
2783 *exit_status = EXIT_SETSID;
2784 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2785 }
2786
2787 exec_context_tty_reset(context, params);
2788
2789 if (unit_shall_confirm_spawn(unit)) {
2790 const char *vc = params->confirm_spawn;
2791 _cleanup_free_ char *cmdline = NULL;
2792
2793 cmdline = exec_command_line(argv);
2794 if (!cmdline) {
2795 *exit_status = EXIT_MEMORY;
2796 return log_oom();
2797 }
2798
2799 r = ask_for_confirmation(vc, unit, cmdline);
2800 if (r != CONFIRM_EXECUTE) {
2801 if (r == CONFIRM_PRETEND_SUCCESS) {
2802 *exit_status = EXIT_SUCCESS;
2803 return 0;
2804 }
2805 *exit_status = EXIT_CONFIRM;
2806 log_unit_error(unit, "Execution cancelled by the user");
2807 return -ECANCELED;
2808 }
2809 }
2810
2811 if (context->dynamic_user && dcreds) {
2812 _cleanup_strv_free_ char **suggested_paths = NULL;
2813
2814 /* Make sure we bypass our own NSS module for any NSS checks */
2815 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2816 *exit_status = EXIT_USER;
2817 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2818 }
2819
2820 r = compile_suggested_paths(context, params, &suggested_paths);
2821 if (r < 0) {
2822 *exit_status = EXIT_MEMORY;
2823 return log_oom();
2824 }
2825
2826 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2827 if (r < 0) {
2828 *exit_status = EXIT_USER;
2829 if (r == -EILSEQ) {
2830 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2831 return -EOPNOTSUPP;
2832 }
2833 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2834 }
2835
2836 if (!uid_is_valid(uid)) {
2837 *exit_status = EXIT_USER;
2838 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2839 return -ESRCH;
2840 }
2841
2842 if (!gid_is_valid(gid)) {
2843 *exit_status = EXIT_USER;
2844 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2845 return -ESRCH;
2846 }
2847
2848 if (dcreds->user)
2849 username = dcreds->user->name;
2850
2851 } else {
2852 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2853 if (r < 0) {
2854 *exit_status = EXIT_USER;
2855 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2856 }
2857
2858 r = get_fixed_group(context, &groupname, &gid);
2859 if (r < 0) {
2860 *exit_status = EXIT_GROUP;
2861 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2862 }
2863 }
2864
2865 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2866 r = get_supplementary_groups(context, username, groupname, gid,
2867 &supplementary_gids, &ngids);
2868 if (r < 0) {
2869 *exit_status = EXIT_GROUP;
2870 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2871 }
2872
2873 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2874 if (r < 0) {
2875 *exit_status = EXIT_USER;
2876 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2877 }
2878
2879 user_lookup_fd = safe_close(user_lookup_fd);
2880
2881 r = acquire_home(context, uid, &home, &home_buffer);
2882 if (r < 0) {
2883 *exit_status = EXIT_CHDIR;
2884 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2885 }
2886
2887 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2888 * must make sure to drop O_NONBLOCK */
2889 if (socket_fd >= 0)
2890 (void) fd_nonblock(socket_fd, false);
2891
2892 r = setup_input(context, params, socket_fd, named_iofds);
2893 if (r < 0) {
2894 *exit_status = EXIT_STDIN;
2895 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2896 }
2897
2898 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2899 if (r < 0) {
2900 *exit_status = EXIT_STDOUT;
2901 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2902 }
2903
2904 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2905 if (r < 0) {
2906 *exit_status = EXIT_STDERR;
2907 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2908 }
2909
2910 if (params->cgroup_path) {
2911 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2912 if (r < 0) {
2913 *exit_status = EXIT_CGROUP;
2914 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2915 }
2916 }
2917
2918 if (context->oom_score_adjust_set) {
2919 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
2920 * prohibit write access to this file, and we shouldn't trip up over that. */
2921 r = set_oom_score_adjust(context->oom_score_adjust);
2922 if (IN_SET(r, -EPERM, -EACCES))
2923 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2924 else if (r < 0) {
2925 *exit_status = EXIT_OOM_ADJUST;
2926 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
2927 }
2928 }
2929
2930 if (context->nice_set)
2931 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2932 *exit_status = EXIT_NICE;
2933 return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
2934 }
2935
2936 if (context->cpu_sched_set) {
2937 struct sched_param param = {
2938 .sched_priority = context->cpu_sched_priority,
2939 };
2940
2941 r = sched_setscheduler(0,
2942 context->cpu_sched_policy |
2943 (context->cpu_sched_reset_on_fork ?
2944 SCHED_RESET_ON_FORK : 0),
2945 &param);
2946 if (r < 0) {
2947 *exit_status = EXIT_SETSCHEDULER;
2948 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
2949 }
2950 }
2951
2952 if (context->cpuset)
2953 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2954 *exit_status = EXIT_CPUAFFINITY;
2955 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
2956 }
2957
2958 if (context->ioprio_set)
2959 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
2960 *exit_status = EXIT_IOPRIO;
2961 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
2962 }
2963
2964 if (context->timer_slack_nsec != NSEC_INFINITY)
2965 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
2966 *exit_status = EXIT_TIMERSLACK;
2967 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
2968 }
2969
2970 if (context->personality != PERSONALITY_INVALID) {
2971 r = safe_personality(context->personality);
2972 if (r < 0) {
2973 *exit_status = EXIT_PERSONALITY;
2974 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
2975 }
2976 }
2977
2978 if (context->utmp_id)
2979 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
2980 context->tty_path,
2981 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
2982 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
2983 USER_PROCESS,
2984 username);
2985
2986 if (context->user) {
2987 r = chown_terminal(STDIN_FILENO, uid);
2988 if (r < 0) {
2989 *exit_status = EXIT_STDIN;
2990 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
2991 }
2992 }
2993
2994 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
2995 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
2996 * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
2997 * touch a single hierarchy too. */
2998 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
2999 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3000 if (r < 0) {
3001 *exit_status = EXIT_CGROUP;
3002 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3003 }
3004 }
3005
3006 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3007 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3008 if (r < 0)
3009 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3010 }
3011
3012 r = build_environment(
3013 unit,
3014 context,
3015 params,
3016 n_fds,
3017 home,
3018 username,
3019 shell,
3020 journal_stream_dev,
3021 journal_stream_ino,
3022 &our_env);
3023 if (r < 0) {
3024 *exit_status = EXIT_MEMORY;
3025 return log_oom();
3026 }
3027
3028 r = build_pass_environment(context, &pass_env);
3029 if (r < 0) {
3030 *exit_status = EXIT_MEMORY;
3031 return log_oom();
3032 }
3033
3034 accum_env = strv_env_merge(5,
3035 params->environment,
3036 our_env,
3037 pass_env,
3038 context->environment,
3039 files_env,
3040 NULL);
3041 if (!accum_env) {
3042 *exit_status = EXIT_MEMORY;
3043 return log_oom();
3044 }
3045 accum_env = strv_env_clean(accum_env);
3046
3047 (void) umask(context->umask);
3048
3049 r = setup_keyring(unit, context, params, uid, gid);
3050 if (r < 0) {
3051 *exit_status = EXIT_KEYRING;
3052 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3053 }
3054
3055 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3056 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3057
3058 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3059 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3060
3061 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3062 if (needs_ambient_hack)
3063 needs_setuid = false;
3064 else
3065 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3066
3067 if (needs_sandboxing) {
3068 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3069 * present. The actual MAC context application will happen later, as late as possible, to avoid
3070 * impacting our own code paths. */
3071
3072 #if HAVE_SELINUX
3073 use_selinux = mac_selinux_use();
3074 #endif
3075 #if ENABLE_SMACK
3076 use_smack = mac_smack_use();
3077 #endif
3078 #if HAVE_APPARMOR
3079 use_apparmor = mac_apparmor_use();
3080 #endif
3081 }
3082
3083 if (needs_setuid) {
3084 if (context->pam_name && username) {
3085 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3086 if (r < 0) {
3087 *exit_status = EXIT_PAM;
3088 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3089 }
3090 }
3091 }
3092
3093 if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3094 if (ns_type_supported(NAMESPACE_NET)) {
3095 r = setup_netns(runtime->netns_storage_socket);
3096 if (r < 0) {
3097 *exit_status = EXIT_NETWORK;
3098 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3099 }
3100 } else
3101 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3102 }
3103
3104 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3105 if (needs_mount_namespace) {
3106 r = apply_mount_namespace(unit, command, context, params, runtime);
3107 if (r < 0) {
3108 *exit_status = EXIT_NAMESPACE;
3109 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3110 }
3111 }
3112
3113 /* Apply just after mount namespace setup */
3114 r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3115 if (r < 0)
3116 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3117
3118 /* Drop groups as early as possible */
3119 if (needs_setuid) {
3120 r = enforce_groups(gid, supplementary_gids, ngids);
3121 if (r < 0) {
3122 *exit_status = EXIT_GROUP;
3123 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3124 }
3125 }
3126
3127 if (needs_sandboxing) {
3128 #if HAVE_SELINUX
3129 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3130 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3131 if (r < 0) {
3132 *exit_status = EXIT_SELINUX_CONTEXT;
3133 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3134 }
3135 }
3136 #endif
3137
3138 if (context->private_users) {
3139 r = setup_private_users(uid, gid);
3140 if (r < 0) {
3141 *exit_status = EXIT_USER;
3142 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3143 }
3144 }
3145 }
3146
3147 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3148 * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3149 * was needed to upload the policy and can now be closed as well. */
3150 r = close_all_fds(fds, n_fds);
3151 if (r >= 0)
3152 r = shift_fds(fds, n_fds);
3153 if (r >= 0)
3154 r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
3155 if (r < 0) {
3156 *exit_status = EXIT_FDS;
3157 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3158 }
3159
3160 secure_bits = context->secure_bits;
3161
3162 if (needs_sandboxing) {
3163 uint64_t bset;
3164 int which_failed;
3165
3166 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3167 if (r < 0) {
3168 *exit_status = EXIT_LIMITS;
3169 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3170 }
3171
3172 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3173 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3174 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3175 *exit_status = EXIT_LIMITS;
3176 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3177 }
3178 }
3179
3180 #if ENABLE_SMACK
3181 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3182 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3183 if (use_smack) {
3184 r = setup_smack(context, command);
3185 if (r < 0) {
3186 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3187 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3188 }
3189 }
3190 #endif
3191
3192 bset = context->capability_bounding_set;
3193 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3194 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3195 * instead of us doing that */
3196 if (needs_ambient_hack)
3197 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3198 (UINT64_C(1) << CAP_SETUID) |
3199 (UINT64_C(1) << CAP_SETGID);
3200
3201 if (!cap_test_all(bset)) {
3202 r = capability_bounding_set_drop(bset, false);
3203 if (r < 0) {
3204 *exit_status = EXIT_CAPABILITIES;
3205 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3206 }
3207 }
3208
3209 /* This is done before enforce_user, but ambient set
3210 * does not survive over setresuid() if keep_caps is not set. */
3211 if (!needs_ambient_hack &&
3212 context->capability_ambient_set != 0) {
3213 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3214 if (r < 0) {
3215 *exit_status = EXIT_CAPABILITIES;
3216 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3217 }
3218 }
3219 }
3220
3221 if (needs_setuid) {
3222 if (context->user) {
3223 r = enforce_user(context, uid);
3224 if (r < 0) {
3225 *exit_status = EXIT_USER;
3226 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3227 }
3228
3229 if (!needs_ambient_hack &&
3230 context->capability_ambient_set != 0) {
3231
3232 /* Fix the ambient capabilities after user change. */
3233 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3234 if (r < 0) {
3235 *exit_status = EXIT_CAPABILITIES;
3236 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3237 }
3238
3239 /* If we were asked to change user and ambient capabilities
3240 * were requested, we had to add keep-caps to the securebits
3241 * so that we would maintain the inherited capability set
3242 * through the setresuid(). Make sure that the bit is added
3243 * also to the context secure_bits so that we don't try to
3244 * drop the bit away next. */
3245
3246 secure_bits |= 1<<SECURE_KEEP_CAPS;
3247 }
3248 }
3249 }
3250
3251 if (needs_sandboxing) {
3252 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3253 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3254 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3255 * are restricted. */
3256
3257 #if HAVE_SELINUX
3258 if (use_selinux) {
3259 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3260
3261 if (exec_context) {
3262 r = setexeccon(exec_context);
3263 if (r < 0) {
3264 *exit_status = EXIT_SELINUX_CONTEXT;
3265 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3266 }
3267 }
3268 }
3269 #endif
3270
3271 #if HAVE_APPARMOR
3272 if (use_apparmor && context->apparmor_profile) {
3273 r = aa_change_onexec(context->apparmor_profile);
3274 if (r < 0 && !context->apparmor_profile_ignore) {
3275 *exit_status = EXIT_APPARMOR_PROFILE;
3276 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3277 }
3278 }
3279 #endif
3280
3281 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3282 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3283 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3284 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3285 *exit_status = EXIT_SECUREBITS;
3286 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3287 }
3288
3289 if (context_has_no_new_privileges(context))
3290 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3291 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3292 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3293 }
3294
3295 #if HAVE_SECCOMP
3296 r = apply_address_families(unit, context);
3297 if (r < 0) {
3298 *exit_status = EXIT_ADDRESS_FAMILIES;
3299 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3300 }
3301
3302 r = apply_memory_deny_write_execute(unit, context);
3303 if (r < 0) {
3304 *exit_status = EXIT_SECCOMP;
3305 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3306 }
3307
3308 r = apply_restrict_realtime(unit, context);
3309 if (r < 0) {
3310 *exit_status = EXIT_SECCOMP;
3311 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3312 }
3313
3314 r = apply_restrict_namespaces(unit, context);
3315 if (r < 0) {
3316 *exit_status = EXIT_SECCOMP;
3317 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3318 }
3319
3320 r = apply_protect_sysctl(unit, context);
3321 if (r < 0) {
3322 *exit_status = EXIT_SECCOMP;
3323 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3324 }
3325
3326 r = apply_protect_kernel_modules(unit, context);
3327 if (r < 0) {
3328 *exit_status = EXIT_SECCOMP;
3329 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3330 }
3331
3332 r = apply_private_devices(unit, context);
3333 if (r < 0) {
3334 *exit_status = EXIT_SECCOMP;
3335 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3336 }
3337
3338 r = apply_syscall_archs(unit, context);
3339 if (r < 0) {
3340 *exit_status = EXIT_SECCOMP;
3341 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3342 }
3343
3344 r = apply_lock_personality(unit, context);
3345 if (r < 0) {
3346 *exit_status = EXIT_SECCOMP;
3347 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3348 }
3349
3350 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3351 * by the filter as little as possible. */
3352 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3353 if (r < 0) {
3354 *exit_status = EXIT_SECCOMP;
3355 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3356 }
3357 #endif
3358 }
3359
3360 if (!strv_isempty(context->unset_environment)) {
3361 char **ee = NULL;
3362
3363 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3364 if (!ee) {
3365 *exit_status = EXIT_MEMORY;
3366 return log_oom();
3367 }
3368
3369 strv_free_and_replace(accum_env, ee);
3370 }
3371
3372 final_argv = replace_env_argv(argv, accum_env);
3373 if (!final_argv) {
3374 *exit_status = EXIT_MEMORY;
3375 return log_oom();
3376 }
3377
3378 if (DEBUG_LOGGING) {
3379 _cleanup_free_ char *line;
3380
3381 line = exec_command_line(final_argv);
3382 if (line) {
3383 log_struct(LOG_DEBUG,
3384 "EXECUTABLE=%s", command->path,
3385 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3386 LOG_UNIT_ID(unit),
3387 LOG_UNIT_INVOCATION_ID(unit),
3388 NULL);
3389 }
3390 }
3391
3392 execve(command->path, final_argv, accum_env);
3393
3394 if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3395
3396 log_struct_errno(LOG_INFO, errno,
3397 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3398 LOG_UNIT_ID(unit),
3399 LOG_UNIT_INVOCATION_ID(unit),
3400 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3401 command->path),
3402 "EXECUTABLE=%s", command->path,
3403 NULL);
3404
3405 return 0;
3406 }
3407
3408 *exit_status = EXIT_EXEC;
3409 return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
3410 }
3411
3412 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
3413 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]);
3414
3415 int exec_spawn(Unit *unit,
3416 ExecCommand *command,
3417 const ExecContext *context,
3418 const ExecParameters *params,
3419 ExecRuntime *runtime,
3420 DynamicCreds *dcreds,
3421 pid_t *ret) {
3422
3423 _cleanup_strv_free_ char **files_env = NULL;
3424 int *fds = NULL;
3425 size_t n_storage_fds = 0, n_socket_fds = 0;
3426 _cleanup_free_ char *line = NULL;
3427 int socket_fd, r;
3428 int named_iofds[3] = { -1, -1, -1 };
3429 char **argv;
3430 pid_t pid;
3431
3432 assert(unit);
3433 assert(command);
3434 assert(context);
3435 assert(ret);
3436 assert(params);
3437 assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
3438
3439 if (context->std_input == EXEC_INPUT_SOCKET ||
3440 context->std_output == EXEC_OUTPUT_SOCKET ||
3441 context->std_error == EXEC_OUTPUT_SOCKET) {
3442
3443 if (params->n_socket_fds > 1) {
3444 log_unit_error(unit, "Got more than one socket.");
3445 return -EINVAL;
3446 }
3447
3448 if (params->n_socket_fds == 0) {
3449 log_unit_error(unit, "Got no socket.");
3450 return -EINVAL;
3451 }
3452
3453 socket_fd = params->fds[0];
3454 } else {
3455 socket_fd = -1;
3456 fds = params->fds;
3457 n_storage_fds = params->n_storage_fds;
3458 n_socket_fds = params->n_socket_fds;
3459 }
3460
3461 r = exec_context_named_iofds(context, params, named_iofds);
3462 if (r < 0)
3463 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3464
3465 r = exec_context_load_environment(unit, context, &files_env);
3466 if (r < 0)
3467 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3468
3469 argv = params->argv ?: command->argv;
3470 line = exec_command_line(argv);
3471 if (!line)
3472 return log_oom();
3473
3474 log_struct(LOG_DEBUG,
3475 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3476 "EXECUTABLE=%s", command->path,
3477 LOG_UNIT_ID(unit),
3478 LOG_UNIT_INVOCATION_ID(unit),
3479 NULL);
3480
3481 pid = fork();
3482 if (pid < 0)
3483 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3484
3485 if (pid == 0) {
3486 int exit_status = EXIT_SUCCESS;
3487
3488 r = exec_child(unit,
3489 command,
3490 context,
3491 params,
3492 runtime,
3493 dcreds,
3494 argv,
3495 socket_fd,
3496 named_iofds,
3497 fds,
3498 n_storage_fds,
3499 n_socket_fds,
3500 files_env,
3501 unit->manager->user_lookup_fds[1],
3502 &exit_status);
3503
3504 if (r < 0) {
3505 log_struct_errno(LOG_ERR, r,
3506 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3507 LOG_UNIT_ID(unit),
3508 LOG_UNIT_INVOCATION_ID(unit),
3509 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3510 exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3511 command->path),
3512 "EXECUTABLE=%s", command->path,
3513 NULL);
3514 }
3515
3516 _exit(exit_status);
3517 }
3518
3519 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3520
3521 /* We add the new process to the cgroup both in the child (so
3522 * that we can be sure that no user code is ever executed
3523 * outside of the cgroup) and in the parent (so that we can be
3524 * sure that when we kill the cgroup the process will be
3525 * killed too). */
3526 if (params->cgroup_path)
3527 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3528
3529 exec_status_start(&command->exec_status, pid);
3530
3531 *ret = pid;
3532 return 0;
3533 }
3534
3535 void exec_context_init(ExecContext *c) {
3536 ExecDirectoryType i;
3537
3538 assert(c);
3539
3540 c->umask = 0022;
3541 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3542 c->cpu_sched_policy = SCHED_OTHER;
3543 c->syslog_priority = LOG_DAEMON|LOG_INFO;
3544 c->syslog_level_prefix = true;
3545 c->ignore_sigpipe = true;
3546 c->timer_slack_nsec = NSEC_INFINITY;
3547 c->personality = PERSONALITY_INVALID;
3548 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3549 c->directories[i].mode = 0755;
3550 c->capability_bounding_set = CAP_ALL;
3551 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
3552 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
3553 c->log_level_max = -1;
3554 }
3555
3556 void exec_context_done(ExecContext *c) {
3557 ExecDirectoryType i;
3558 size_t l;
3559
3560 assert(c);
3561
3562 c->environment = strv_free(c->environment);
3563 c->environment_files = strv_free(c->environment_files);
3564 c->pass_environment = strv_free(c->pass_environment);
3565 c->unset_environment = strv_free(c->unset_environment);
3566
3567 rlimit_free_all(c->rlimit);
3568
3569 for (l = 0; l < 3; l++) {
3570 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3571 c->stdio_file[l] = mfree(c->stdio_file[l]);
3572 }
3573
3574 c->working_directory = mfree(c->working_directory);
3575 c->root_directory = mfree(c->root_directory);
3576 c->root_image = mfree(c->root_image);
3577 c->tty_path = mfree(c->tty_path);
3578 c->syslog_identifier = mfree(c->syslog_identifier);
3579 c->user = mfree(c->user);
3580 c->group = mfree(c->group);
3581
3582 c->supplementary_groups = strv_free(c->supplementary_groups);
3583
3584 c->pam_name = mfree(c->pam_name);
3585
3586 c->read_only_paths = strv_free(c->read_only_paths);
3587 c->read_write_paths = strv_free(c->read_write_paths);
3588 c->inaccessible_paths = strv_free(c->inaccessible_paths);
3589
3590 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3591 c->bind_mounts = NULL;
3592 c->n_bind_mounts = 0;
3593 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3594 c->temporary_filesystems = NULL;
3595 c->n_temporary_filesystems = 0;
3596
3597 c->cpuset = cpu_set_mfree(c->cpuset);
3598
3599 c->utmp_id = mfree(c->utmp_id);
3600 c->selinux_context = mfree(c->selinux_context);
3601 c->apparmor_profile = mfree(c->apparmor_profile);
3602 c->smack_process_label = mfree(c->smack_process_label);
3603
3604 c->syscall_filter = hashmap_free(c->syscall_filter);
3605 c->syscall_archs = set_free(c->syscall_archs);
3606 c->address_families = set_free(c->address_families);
3607
3608 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3609 c->directories[i].paths = strv_free(c->directories[i].paths);
3610
3611 c->log_level_max = -1;
3612
3613 exec_context_free_log_extra_fields(c);
3614
3615 c->stdin_data = mfree(c->stdin_data);
3616 c->stdin_data_size = 0;
3617 }
3618
3619 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
3620 char **i;
3621
3622 assert(c);
3623
3624 if (!runtime_prefix)
3625 return 0;
3626
3627 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3628 _cleanup_free_ char *p;
3629
3630 p = strjoin(runtime_prefix, "/", *i);
3631 if (!p)
3632 return -ENOMEM;
3633
3634 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3635 * next. */
3636 (void) rm_rf(p, REMOVE_ROOT);
3637 }
3638
3639 return 0;
3640 }
3641
3642 static void exec_command_done(ExecCommand *c) {
3643 assert(c);
3644
3645 c->path = mfree(c->path);
3646
3647 c->argv = strv_free(c->argv);
3648 }
3649
3650 void exec_command_done_array(ExecCommand *c, size_t n) {
3651 size_t i;
3652
3653 for (i = 0; i < n; i++)
3654 exec_command_done(c+i);
3655 }
3656
3657 ExecCommand* exec_command_free_list(ExecCommand *c) {
3658 ExecCommand *i;
3659
3660 while ((i = c)) {
3661 LIST_REMOVE(command, c, i);
3662 exec_command_done(i);
3663 free(i);
3664 }
3665
3666 return NULL;
3667 }
3668
3669 void exec_command_free_array(ExecCommand **c, size_t n) {
3670 size_t i;
3671
3672 for (i = 0; i < n; i++)
3673 c[i] = exec_command_free_list(c[i]);
3674 }
3675
3676 typedef struct InvalidEnvInfo {
3677 const Unit *unit;
3678 const char *path;
3679 } InvalidEnvInfo;
3680
3681 static void invalid_env(const char *p, void *userdata) {
3682 InvalidEnvInfo *info = userdata;
3683
3684 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3685 }
3686
3687 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3688 assert(c);
3689
3690 switch (fd_index) {
3691
3692 case STDIN_FILENO:
3693 if (c->std_input != EXEC_INPUT_NAMED_FD)
3694 return NULL;
3695
3696 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3697
3698 case STDOUT_FILENO:
3699 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3700 return NULL;
3701
3702 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3703
3704 case STDERR_FILENO:
3705 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3706 return NULL;
3707
3708 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3709
3710 default:
3711 return NULL;
3712 }
3713 }
3714
3715 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3716 size_t i, targets;
3717 const char* stdio_fdname[3];
3718 size_t n_fds;
3719
3720 assert(c);
3721 assert(p);
3722
3723 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3724 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3725 (c->std_error == EXEC_OUTPUT_NAMED_FD);
3726
3727 for (i = 0; i < 3; i++)
3728 stdio_fdname[i] = exec_context_fdname(c, i);
3729
3730 n_fds = p->n_storage_fds + p->n_socket_fds;
3731
3732 for (i = 0; i < n_fds && targets > 0; i++)
3733 if (named_iofds[STDIN_FILENO] < 0 &&
3734 c->std_input == EXEC_INPUT_NAMED_FD &&
3735 stdio_fdname[STDIN_FILENO] &&
3736 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3737
3738 named_iofds[STDIN_FILENO] = p->fds[i];
3739 targets--;
3740
3741 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3742 c->std_output == EXEC_OUTPUT_NAMED_FD &&
3743 stdio_fdname[STDOUT_FILENO] &&
3744 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3745
3746 named_iofds[STDOUT_FILENO] = p->fds[i];
3747 targets--;
3748
3749 } else if (named_iofds[STDERR_FILENO] < 0 &&
3750 c->std_error == EXEC_OUTPUT_NAMED_FD &&
3751 stdio_fdname[STDERR_FILENO] &&
3752 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3753
3754 named_iofds[STDERR_FILENO] = p->fds[i];
3755 targets--;
3756 }
3757
3758 return targets == 0 ? 0 : -ENOENT;
3759 }
3760
3761 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
3762 char **i, **r = NULL;
3763
3764 assert(c);
3765 assert(l);
3766
3767 STRV_FOREACH(i, c->environment_files) {
3768 char *fn;
3769 int k;
3770 unsigned n;
3771 bool ignore = false;
3772 char **p;
3773 _cleanup_globfree_ glob_t pglob = {};
3774
3775 fn = *i;
3776
3777 if (fn[0] == '-') {
3778 ignore = true;
3779 fn++;
3780 }
3781
3782 if (!path_is_absolute(fn)) {
3783 if (ignore)
3784 continue;
3785
3786 strv_free(r);
3787 return -EINVAL;
3788 }
3789
3790 /* Filename supports globbing, take all matching files */
3791 k = safe_glob(fn, 0, &pglob);
3792 if (k < 0) {
3793 if (ignore)
3794 continue;
3795
3796 strv_free(r);
3797 return k;
3798 }
3799
3800 /* When we don't match anything, -ENOENT should be returned */
3801 assert(pglob.gl_pathc > 0);
3802
3803 for (n = 0; n < pglob.gl_pathc; n++) {
3804 k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3805 if (k < 0) {
3806 if (ignore)
3807 continue;
3808
3809 strv_free(r);
3810 return k;
3811 }
3812 /* Log invalid environment variables with filename */
3813 if (p) {
3814 InvalidEnvInfo info = {
3815 .unit = unit,
3816 .path = pglob.gl_pathv[n]
3817 };
3818
3819 p = strv_env_clean_with_callback(p, invalid_env, &info);
3820 }
3821
3822 if (!r)
3823 r = p;
3824 else {
3825 char **m;
3826
3827 m = strv_env_merge(2, r, p);
3828 strv_free(r);
3829 strv_free(p);
3830 if (!m)
3831 return -ENOMEM;
3832
3833 r = m;
3834 }
3835 }
3836 }
3837
3838 *l = r;
3839
3840 return 0;
3841 }
3842
3843 static bool tty_may_match_dev_console(const char *tty) {
3844 _cleanup_free_ char *resolved = NULL;
3845
3846 if (!tty)
3847 return true;
3848
3849 tty = skip_dev_prefix(tty);
3850
3851 /* trivial identity? */
3852 if (streq(tty, "console"))
3853 return true;
3854
3855 if (resolve_dev_console(&resolved) < 0)
3856 return true; /* if we could not resolve, assume it may */
3857
3858 /* "tty0" means the active VC, so it may be the same sometimes */
3859 return streq(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
3860 }
3861
3862 bool exec_context_may_touch_console(const ExecContext *ec) {
3863
3864 return (ec->tty_reset ||
3865 ec->tty_vhangup ||
3866 ec->tty_vt_disallocate ||
3867 is_terminal_input(ec->std_input) ||
3868 is_terminal_output(ec->std_output) ||
3869 is_terminal_output(ec->std_error)) &&
3870 tty_may_match_dev_console(exec_context_tty_path(ec));
3871 }
3872
3873 static void strv_fprintf(FILE *f, char **l) {
3874 char **g;
3875
3876 assert(f);
3877
3878 STRV_FOREACH(g, l)
3879 fprintf(f, " %s", *g);
3880 }
3881
3882 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
3883 ExecDirectoryType dt;
3884 char **e, **d;
3885 unsigned i;
3886 int r;
3887
3888 assert(c);
3889 assert(f);
3890
3891 prefix = strempty(prefix);
3892
3893 fprintf(f,
3894 "%sUMask: %04o\n"
3895 "%sWorkingDirectory: %s\n"
3896 "%sRootDirectory: %s\n"
3897 "%sNonBlocking: %s\n"
3898 "%sPrivateTmp: %s\n"
3899 "%sPrivateDevices: %s\n"
3900 "%sProtectKernelTunables: %s\n"
3901 "%sProtectKernelModules: %s\n"
3902 "%sProtectControlGroups: %s\n"
3903 "%sPrivateNetwork: %s\n"
3904 "%sPrivateUsers: %s\n"
3905 "%sProtectHome: %s\n"
3906 "%sProtectSystem: %s\n"
3907 "%sMountAPIVFS: %s\n"
3908 "%sIgnoreSIGPIPE: %s\n"
3909 "%sMemoryDenyWriteExecute: %s\n"
3910 "%sRestrictRealtime: %s\n"
3911 "%sKeyringMode: %s\n",
3912 prefix, c->umask,
3913 prefix, c->working_directory ? c->working_directory : "/",
3914 prefix, c->root_directory ? c->root_directory : "/",
3915 prefix, yes_no(c->non_blocking),
3916 prefix, yes_no(c->private_tmp),
3917 prefix, yes_no(c->private_devices),
3918 prefix, yes_no(c->protect_kernel_tunables),
3919 prefix, yes_no(c->protect_kernel_modules),
3920 prefix, yes_no(c->protect_control_groups),
3921 prefix, yes_no(c->private_network),
3922 prefix, yes_no(c->private_users),
3923 prefix, protect_home_to_string(c->protect_home),
3924 prefix, protect_system_to_string(c->protect_system),
3925 prefix, yes_no(c->mount_apivfs),
3926 prefix, yes_no(c->ignore_sigpipe),
3927 prefix, yes_no(c->memory_deny_write_execute),
3928 prefix, yes_no(c->restrict_realtime),
3929 prefix, exec_keyring_mode_to_string(c->keyring_mode));
3930
3931 if (c->root_image)
3932 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
3933
3934 STRV_FOREACH(e, c->environment)
3935 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
3936
3937 STRV_FOREACH(e, c->environment_files)
3938 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
3939
3940 STRV_FOREACH(e, c->pass_environment)
3941 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
3942
3943 STRV_FOREACH(e, c->unset_environment)
3944 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
3945
3946 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
3947
3948 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3949 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
3950
3951 STRV_FOREACH(d, c->directories[dt].paths)
3952 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
3953 }
3954
3955 if (c->nice_set)
3956 fprintf(f,
3957 "%sNice: %i\n",
3958 prefix, c->nice);
3959
3960 if (c->oom_score_adjust_set)
3961 fprintf(f,
3962 "%sOOMScoreAdjust: %i\n",
3963 prefix, c->oom_score_adjust);
3964
3965 for (i = 0; i < RLIM_NLIMITS; i++)
3966 if (c->rlimit[i]) {
3967 fprintf(f, "Limit%s%s: " RLIM_FMT "\n",
3968 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
3969 fprintf(f, "Limit%s%sSoft: " RLIM_FMT "\n",
3970 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
3971 }
3972
3973 if (c->ioprio_set) {
3974 _cleanup_free_ char *class_str = NULL;
3975
3976 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
3977 if (r >= 0)
3978 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
3979
3980 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
3981 }
3982
3983 if (c->cpu_sched_set) {
3984 _cleanup_free_ char *policy_str = NULL;
3985
3986 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
3987 if (r >= 0)
3988 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
3989
3990 fprintf(f,
3991 "%sCPUSchedulingPriority: %i\n"
3992 "%sCPUSchedulingResetOnFork: %s\n",
3993 prefix, c->cpu_sched_priority,
3994 prefix, yes_no(c->cpu_sched_reset_on_fork));
3995 }
3996
3997 if (c->cpuset) {
3998 fprintf(f, "%sCPUAffinity:", prefix);
3999 for (i = 0; i < c->cpuset_ncpus; i++)
4000 if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
4001 fprintf(f, " %u", i);
4002 fputs("\n", f);
4003 }
4004
4005 if (c->timer_slack_nsec != NSEC_INFINITY)
4006 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4007
4008 fprintf(f,
4009 "%sStandardInput: %s\n"
4010 "%sStandardOutput: %s\n"
4011 "%sStandardError: %s\n",
4012 prefix, exec_input_to_string(c->std_input),
4013 prefix, exec_output_to_string(c->std_output),
4014 prefix, exec_output_to_string(c->std_error));
4015
4016 if (c->std_input == EXEC_INPUT_NAMED_FD)
4017 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4018 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4019 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4020 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4021 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4022
4023 if (c->std_input == EXEC_INPUT_FILE)
4024 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4025 if (c->std_output == EXEC_OUTPUT_FILE)
4026 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4027 if (c->std_error == EXEC_OUTPUT_FILE)
4028 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4029
4030 if (c->tty_path)
4031 fprintf(f,
4032 "%sTTYPath: %s\n"
4033 "%sTTYReset: %s\n"
4034 "%sTTYVHangup: %s\n"
4035 "%sTTYVTDisallocate: %s\n",
4036 prefix, c->tty_path,
4037 prefix, yes_no(c->tty_reset),
4038 prefix, yes_no(c->tty_vhangup),
4039 prefix, yes_no(c->tty_vt_disallocate));
4040
4041 if (IN_SET(c->std_output,
4042 EXEC_OUTPUT_SYSLOG,
4043 EXEC_OUTPUT_KMSG,
4044 EXEC_OUTPUT_JOURNAL,
4045 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4046 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4047 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4048 IN_SET(c->std_error,
4049 EXEC_OUTPUT_SYSLOG,
4050 EXEC_OUTPUT_KMSG,
4051 EXEC_OUTPUT_JOURNAL,
4052 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4053 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4054 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4055
4056 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4057
4058 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4059 if (r >= 0)
4060 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4061
4062 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4063 if (r >= 0)
4064 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4065 }
4066
4067 if (c->log_level_max >= 0) {
4068 _cleanup_free_ char *t = NULL;
4069
4070 (void) log_level_to_string_alloc(c->log_level_max, &t);
4071
4072 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4073 }
4074
4075 if (c->n_log_extra_fields > 0) {
4076 size_t j;
4077
4078 for (j = 0; j < c->n_log_extra_fields; j++) {
4079 fprintf(f, "%sLogExtraFields: ", prefix);
4080 fwrite(c->log_extra_fields[j].iov_base,
4081 1, c->log_extra_fields[j].iov_len,
4082 f);
4083 fputc('\n', f);
4084 }
4085 }
4086
4087 if (c->secure_bits) {
4088 _cleanup_free_ char *str = NULL;
4089
4090 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4091 if (r >= 0)
4092 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4093 }
4094
4095 if (c->capability_bounding_set != CAP_ALL) {
4096 _cleanup_free_ char *str = NULL;
4097
4098 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4099 if (r >= 0)
4100 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4101 }
4102
4103 if (c->capability_ambient_set != 0) {
4104 _cleanup_free_ char *str = NULL;
4105
4106 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4107 if (r >= 0)
4108 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4109 }
4110
4111 if (c->user)
4112 fprintf(f, "%sUser: %s\n", prefix, c->user);
4113 if (c->group)
4114 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4115
4116 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4117
4118 if (!strv_isempty(c->supplementary_groups)) {
4119 fprintf(f, "%sSupplementaryGroups:", prefix);
4120 strv_fprintf(f, c->supplementary_groups);
4121 fputs("\n", f);
4122 }
4123
4124 if (c->pam_name)
4125 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4126
4127 if (!strv_isempty(c->read_write_paths)) {
4128 fprintf(f, "%sReadWritePaths:", prefix);
4129 strv_fprintf(f, c->read_write_paths);
4130 fputs("\n", f);
4131 }
4132
4133 if (!strv_isempty(c->read_only_paths)) {
4134 fprintf(f, "%sReadOnlyPaths:", prefix);
4135 strv_fprintf(f, c->read_only_paths);
4136 fputs("\n", f);
4137 }
4138
4139 if (!strv_isempty(c->inaccessible_paths)) {
4140 fprintf(f, "%sInaccessiblePaths:", prefix);
4141 strv_fprintf(f, c->inaccessible_paths);
4142 fputs("\n", f);
4143 }
4144
4145 if (c->n_bind_mounts > 0)
4146 for (i = 0; i < c->n_bind_mounts; i++)
4147 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4148 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4149 c->bind_mounts[i].ignore_enoent ? "-": "",
4150 c->bind_mounts[i].source,
4151 c->bind_mounts[i].destination,
4152 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4153
4154 if (c->n_temporary_filesystems > 0)
4155 for (i = 0; i < c->n_temporary_filesystems; i++) {
4156 TemporaryFileSystem *t = c->temporary_filesystems + i;
4157
4158 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4159 t->path,
4160 isempty(t->options) ? "" : ":",
4161 strempty(t->options));
4162 }
4163
4164 if (c->utmp_id)
4165 fprintf(f,
4166 "%sUtmpIdentifier: %s\n",
4167 prefix, c->utmp_id);
4168
4169 if (c->selinux_context)
4170 fprintf(f,
4171 "%sSELinuxContext: %s%s\n",
4172 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4173
4174 if (c->apparmor_profile)
4175 fprintf(f,
4176 "%sAppArmorProfile: %s%s\n",
4177 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4178
4179 if (c->smack_process_label)
4180 fprintf(f,
4181 "%sSmackProcessLabel: %s%s\n",
4182 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4183
4184 if (c->personality != PERSONALITY_INVALID)
4185 fprintf(f,
4186 "%sPersonality: %s\n",
4187 prefix, strna(personality_to_string(c->personality)));
4188
4189 fprintf(f,
4190 "%sLockPersonality: %s\n",
4191 prefix, yes_no(c->lock_personality));
4192
4193 if (c->syscall_filter) {
4194 #if HAVE_SECCOMP
4195 Iterator j;
4196 void *id, *val;
4197 bool first = true;
4198 #endif
4199
4200 fprintf(f,
4201 "%sSystemCallFilter: ",
4202 prefix);
4203
4204 if (!c->syscall_whitelist)
4205 fputc('~', f);
4206
4207 #if HAVE_SECCOMP
4208 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4209 _cleanup_free_ char *name = NULL;
4210 const char *errno_name = NULL;
4211 int num = PTR_TO_INT(val);
4212
4213 if (first)
4214 first = false;
4215 else
4216 fputc(' ', f);
4217
4218 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4219 fputs(strna(name), f);
4220
4221 if (num >= 0) {
4222 errno_name = errno_to_name(num);
4223 if (errno_name)
4224 fprintf(f, ":%s", errno_name);
4225 else
4226 fprintf(f, ":%d", num);
4227 }
4228 }
4229 #endif
4230
4231 fputc('\n', f);
4232 }
4233
4234 if (c->syscall_archs) {
4235 #if HAVE_SECCOMP
4236 Iterator j;
4237 void *id;
4238 #endif
4239
4240 fprintf(f,
4241 "%sSystemCallArchitectures:",
4242 prefix);
4243
4244 #if HAVE_SECCOMP
4245 SET_FOREACH(id, c->syscall_archs, j)
4246 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4247 #endif
4248 fputc('\n', f);
4249 }
4250
4251 if (exec_context_restrict_namespaces_set(c)) {
4252 _cleanup_free_ char *s = NULL;
4253
4254 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4255 if (r >= 0)
4256 fprintf(f, "%sRestrictNamespaces: %s\n",
4257 prefix, s);
4258 }
4259
4260 if (c->syscall_errno > 0) {
4261 const char *errno_name;
4262
4263 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4264
4265 errno_name = errno_to_name(c->syscall_errno);
4266 if (errno_name)
4267 fprintf(f, "%s\n", errno_name);
4268 else
4269 fprintf(f, "%d\n", c->syscall_errno);
4270 }
4271
4272 if (c->apparmor_profile)
4273 fprintf(f,
4274 "%sAppArmorProfile: %s%s\n",
4275 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4276 }
4277
4278 bool exec_context_maintains_privileges(const ExecContext *c) {
4279 assert(c);
4280
4281 /* Returns true if the process forked off would run under
4282 * an unchanged UID or as root. */
4283
4284 if (!c->user)
4285 return true;
4286
4287 if (streq(c->user, "root") || streq(c->user, "0"))
4288 return true;
4289
4290 return false;
4291 }
4292
4293 int exec_context_get_effective_ioprio(const ExecContext *c) {
4294 int p;
4295
4296 assert(c);
4297
4298 if (c->ioprio_set)
4299 return c->ioprio;
4300
4301 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4302 if (p < 0)
4303 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4304
4305 return p;
4306 }
4307
4308 void exec_context_free_log_extra_fields(ExecContext *c) {
4309 size_t l;
4310
4311 assert(c);
4312
4313 for (l = 0; l < c->n_log_extra_fields; l++)
4314 free(c->log_extra_fields[l].iov_base);
4315 c->log_extra_fields = mfree(c->log_extra_fields);
4316 c->n_log_extra_fields = 0;
4317 }
4318
4319 void exec_status_start(ExecStatus *s, pid_t pid) {
4320 assert(s);
4321
4322 zero(*s);
4323 s->pid = pid;
4324 dual_timestamp_get(&s->start_timestamp);
4325 }
4326
4327 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4328 assert(s);
4329
4330 if (s->pid && s->pid != pid)
4331 zero(*s);
4332
4333 s->pid = pid;
4334 dual_timestamp_get(&s->exit_timestamp);
4335
4336 s->code = code;
4337 s->status = status;
4338
4339 if (context) {
4340 if (context->utmp_id)
4341 utmp_put_dead_process(context->utmp_id, pid, code, status);
4342
4343 exec_context_tty_reset(context, NULL);
4344 }
4345 }
4346
4347 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4348 char buf[FORMAT_TIMESTAMP_MAX];
4349
4350 assert(s);
4351 assert(f);
4352
4353 if (s->pid <= 0)
4354 return;
4355
4356 prefix = strempty(prefix);
4357
4358 fprintf(f,
4359 "%sPID: "PID_FMT"\n",
4360 prefix, s->pid);
4361
4362 if (dual_timestamp_is_set(&s->start_timestamp))
4363 fprintf(f,
4364 "%sStart Timestamp: %s\n",
4365 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4366
4367 if (dual_timestamp_is_set(&s->exit_timestamp))
4368 fprintf(f,
4369 "%sExit Timestamp: %s\n"
4370 "%sExit Code: %s\n"
4371 "%sExit Status: %i\n",
4372 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4373 prefix, sigchld_code_to_string(s->code),
4374 prefix, s->status);
4375 }
4376
4377 static char *exec_command_line(char **argv) {
4378 size_t k;
4379 char *n, *p, **a;
4380 bool first = true;
4381
4382 assert(argv);
4383
4384 k = 1;
4385 STRV_FOREACH(a, argv)
4386 k += strlen(*a)+3;
4387
4388 n = new(char, k);
4389 if (!n)
4390 return NULL;
4391
4392 p = n;
4393 STRV_FOREACH(a, argv) {
4394
4395 if (!first)
4396 *(p++) = ' ';
4397 else
4398 first = false;
4399
4400 if (strpbrk(*a, WHITESPACE)) {
4401 *(p++) = '\'';
4402 p = stpcpy(p, *a);
4403 *(p++) = '\'';
4404 } else
4405 p = stpcpy(p, *a);
4406
4407 }
4408
4409 *p = 0;
4410
4411 /* FIXME: this doesn't really handle arguments that have
4412 * spaces and ticks in them */
4413
4414 return n;
4415 }
4416
4417 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4418 _cleanup_free_ char *cmd = NULL;
4419 const char *prefix2;
4420
4421 assert(c);
4422 assert(f);
4423
4424 prefix = strempty(prefix);
4425 prefix2 = strjoina(prefix, "\t");
4426
4427 cmd = exec_command_line(c->argv);
4428 fprintf(f,
4429 "%sCommand Line: %s\n",
4430 prefix, cmd ? cmd : strerror(ENOMEM));
4431
4432 exec_status_dump(&c->exec_status, f, prefix2);
4433 }
4434
4435 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4436 assert(f);
4437
4438 prefix = strempty(prefix);
4439
4440 LIST_FOREACH(command, c, c)
4441 exec_command_dump(c, f, prefix);
4442 }
4443
4444 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4445 ExecCommand *end;
4446
4447 assert(l);
4448 assert(e);
4449
4450 if (*l) {
4451 /* It's kind of important, that we keep the order here */
4452 LIST_FIND_TAIL(command, *l, end);
4453 LIST_INSERT_AFTER(command, *l, end, e);
4454 } else
4455 *l = e;
4456 }
4457
4458 int exec_command_set(ExecCommand *c, const char *path, ...) {
4459 va_list ap;
4460 char **l, *p;
4461
4462 assert(c);
4463 assert(path);
4464
4465 va_start(ap, path);
4466 l = strv_new_ap(path, ap);
4467 va_end(ap);
4468
4469 if (!l)
4470 return -ENOMEM;
4471
4472 p = strdup(path);
4473 if (!p) {
4474 strv_free(l);
4475 return -ENOMEM;
4476 }
4477
4478 free(c->path);
4479 c->path = p;
4480
4481 return strv_free_and_replace(c->argv, l);
4482 }
4483
4484 int exec_command_append(ExecCommand *c, const char *path, ...) {
4485 _cleanup_strv_free_ char **l = NULL;
4486 va_list ap;
4487 int r;
4488
4489 assert(c);
4490 assert(path);
4491
4492 va_start(ap, path);
4493 l = strv_new_ap(path, ap);
4494 va_end(ap);
4495
4496 if (!l)
4497 return -ENOMEM;
4498
4499 r = strv_extend_strv(&c->argv, l, false);
4500 if (r < 0)
4501 return r;
4502
4503 return 0;
4504 }
4505
4506 static void *remove_tmpdir_thread(void *p) {
4507 _cleanup_free_ char *path = p;
4508
4509 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4510 return NULL;
4511 }
4512
4513 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
4514 int r;
4515
4516 if (!rt)
4517 return NULL;
4518
4519 if (rt->manager)
4520 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
4521
4522 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
4523 if (destroy && rt->tmp_dir) {
4524 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4525
4526 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4527 if (r < 0) {
4528 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4529 free(rt->tmp_dir);
4530 }
4531
4532 rt->tmp_dir = NULL;
4533 }
4534
4535 if (destroy && rt->var_tmp_dir) {
4536 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4537
4538 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4539 if (r < 0) {
4540 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4541 free(rt->var_tmp_dir);
4542 }
4543
4544 rt->var_tmp_dir = NULL;
4545 }
4546
4547 rt->id = mfree(rt->id);
4548 rt->tmp_dir = mfree(rt->tmp_dir);
4549 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
4550 safe_close_pair(rt->netns_storage_socket);
4551 return mfree(rt);
4552 }
4553
4554 static void exec_runtime_freep(ExecRuntime **rt) {
4555 if (*rt)
4556 (void) exec_runtime_free(*rt, false);
4557 }
4558
4559 static int exec_runtime_allocate(ExecRuntime **rt) {
4560 assert(rt);
4561
4562 *rt = new0(ExecRuntime, 1);
4563 if (!*rt)
4564 return -ENOMEM;
4565
4566 (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4567 return 0;
4568 }
4569
4570 static int exec_runtime_add(
4571 Manager *m,
4572 const char *id,
4573 const char *tmp_dir,
4574 const char *var_tmp_dir,
4575 const int netns_storage_socket[2],
4576 ExecRuntime **ret) {
4577
4578 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
4579 int r;
4580
4581 assert(m);
4582 assert(id);
4583
4584 r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
4585 if (r < 0)
4586 return r;
4587
4588 r = exec_runtime_allocate(&rt);
4589 if (r < 0)
4590 return r;
4591
4592 rt->id = strdup(id);
4593 if (!rt->id)
4594 return -ENOMEM;
4595
4596 if (tmp_dir) {
4597 rt->tmp_dir = strdup(tmp_dir);
4598 if (!rt->tmp_dir)
4599 return -ENOMEM;
4600
4601 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
4602 assert(var_tmp_dir);
4603 rt->var_tmp_dir = strdup(var_tmp_dir);
4604 if (!rt->var_tmp_dir)
4605 return -ENOMEM;
4606 }
4607
4608 if (netns_storage_socket) {
4609 rt->netns_storage_socket[0] = netns_storage_socket[0];
4610 rt->netns_storage_socket[1] = netns_storage_socket[1];
4611 }
4612
4613 r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
4614 if (r < 0)
4615 return r;
4616
4617 rt->manager = m;
4618
4619 if (ret)
4620 *ret = rt;
4621
4622 /* do not remove created ExecRuntime object when the operation succeeds. */
4623 rt = NULL;
4624 return 0;
4625 }
4626
4627 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
4628 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
4629 _cleanup_close_pair_ int netns_storage_socket[2] = {-1, -1};
4630 int r;
4631
4632 assert(m);
4633 assert(c);
4634 assert(id);
4635
4636 /* It is not necessary to create ExecRuntime object. */
4637 if (!c->private_network && !c->private_tmp)
4638 return 0;
4639
4640 if (c->private_tmp) {
4641 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
4642 if (r < 0)
4643 return r;
4644 }
4645
4646 if (c->private_network) {
4647 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
4648 return -errno;
4649 }
4650
4651 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
4652 if (r < 0)
4653 return r;
4654
4655 /* Avoid cleanup */
4656 netns_storage_socket[0] = -1;
4657 netns_storage_socket[1] = -1;
4658 return 1;
4659 }
4660
4661 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
4662 ExecRuntime *rt;
4663 int r;
4664
4665 assert(m);
4666 assert(id);
4667 assert(ret);
4668
4669 rt = hashmap_get(m->exec_runtime_by_id, id);
4670 if (rt)
4671 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
4672 goto ref;
4673
4674 if (!create)
4675 return 0;
4676
4677 /* If not found, then create a new object. */
4678 r = exec_runtime_make(m, c, id, &rt);
4679 if (r <= 0)
4680 /* When r == 0, it is not necessary to create ExecRuntime object. */
4681 return r;
4682
4683 ref:
4684 /* increment reference counter. */
4685 rt->n_ref++;
4686 *ret = rt;
4687 return 1;
4688 }
4689
4690 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
4691 if (!rt)
4692 return NULL;
4693
4694 assert(rt->n_ref > 0);
4695
4696 rt->n_ref--;
4697 if (rt->n_ref > 0)
4698 return NULL;
4699
4700 return exec_runtime_free(rt, destroy);
4701 }
4702
4703 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
4704 ExecRuntime *rt;
4705 Iterator i;
4706
4707 assert(m);
4708 assert(f);
4709 assert(fds);
4710
4711 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4712 fprintf(f, "exec-runtime=%s", rt->id);
4713
4714 if (rt->tmp_dir)
4715 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
4716
4717 if (rt->var_tmp_dir)
4718 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
4719
4720 if (rt->netns_storage_socket[0] >= 0) {
4721 int copy;
4722
4723 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4724 if (copy < 0)
4725 return copy;
4726
4727 fprintf(f, " netns-socket-0=%i", copy);
4728 }
4729
4730 if (rt->netns_storage_socket[1] >= 0) {
4731 int copy;
4732
4733 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4734 if (copy < 0)
4735 return copy;
4736
4737 fprintf(f, " netns-socket-1=%i", copy);
4738 }
4739
4740 fputc('\n', f);
4741 }
4742
4743 return 0;
4744 }
4745
4746 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
4747 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
4748 ExecRuntime *rt;
4749 int r;
4750
4751 /* This is for the migration from old (v237 or earlier) deserialization text.
4752 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
4753 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
4754 * so or not from the serialized text, then we always creates a new object owned by this. */
4755
4756 assert(u);
4757 assert(key);
4758 assert(value);
4759
4760 /* Manager manages ExecRuntime objects by the unit id.
4761 * So, we omit the serialized text when the unit does not have id (yet?)... */
4762 if (isempty(u->id)) {
4763 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
4764 return 0;
4765 }
4766
4767 r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
4768 if (r < 0) {
4769 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
4770 return 0;
4771 }
4772
4773 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
4774 if (!rt) {
4775 r = exec_runtime_allocate(&rt_create);
4776 if (r < 0)
4777 return log_oom();
4778
4779 rt_create->id = strdup(u->id);
4780 if (!rt_create->id)
4781 return log_oom();
4782
4783 rt = rt_create;
4784 }
4785
4786 if (streq(key, "tmp-dir")) {
4787 char *copy;
4788
4789 copy = strdup(value);
4790 if (!copy)
4791 return log_oom();
4792
4793 free_and_replace(rt->tmp_dir, copy);
4794
4795 } else if (streq(key, "var-tmp-dir")) {
4796 char *copy;
4797
4798 copy = strdup(value);
4799 if (!copy)
4800 return log_oom();
4801
4802 free_and_replace(rt->var_tmp_dir, copy);
4803
4804 } else if (streq(key, "netns-socket-0")) {
4805 int fd;
4806
4807 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4808 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4809 return 0;
4810 }
4811
4812 safe_close(rt->netns_storage_socket[0]);
4813 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
4814
4815 } else if (streq(key, "netns-socket-1")) {
4816 int fd;
4817
4818 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4819 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4820 return 0;
4821 }
4822
4823 safe_close(rt->netns_storage_socket[1]);
4824 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
4825 } else
4826 return 0;
4827
4828 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
4829 if (rt_create) {
4830 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
4831 if (r < 0) {
4832 log_unit_debug_errno(u, r, "Failed to put runtime paramter to manager's storage: %m");
4833 return 0;
4834 }
4835
4836 rt_create->manager = u->manager;
4837
4838 /* Avoid cleanup */
4839 rt_create = NULL;
4840 }
4841
4842 return 1;
4843 }
4844
4845 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
4846 char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
4847 int r, fd0 = -1, fd1 = -1;
4848 const char *p, *v = value;
4849 size_t n;
4850
4851 assert(m);
4852 assert(value);
4853 assert(fds);
4854
4855 n = strcspn(v, " ");
4856 id = strndupa(v, n);
4857 if (v[n] != ' ')
4858 goto finalize;
4859 p = v + n + 1;
4860
4861 v = startswith(p, "tmp-dir=");
4862 if (v) {
4863 n = strcspn(v, " ");
4864 tmp_dir = strndupa(v, n);
4865 if (v[n] != ' ')
4866 goto finalize;
4867 p = v + n + 1;
4868 }
4869
4870 v = startswith(p, "var-tmp-dir=");
4871 if (v) {
4872 n = strcspn(v, " ");
4873 var_tmp_dir = strndupa(v, n);
4874 if (v[n] != ' ')
4875 goto finalize;
4876 p = v + n + 1;
4877 }
4878
4879 v = startswith(p, "netns-socket-0=");
4880 if (v) {
4881 char *buf;
4882
4883 n = strcspn(v, " ");
4884 buf = strndupa(v, n);
4885 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
4886 log_debug("Unable to process exec-runtime netns fd specification.");
4887 return;
4888 }
4889 fd0 = fdset_remove(fds, fd0);
4890 if (v[n] != ' ')
4891 goto finalize;
4892 p = v + n + 1;
4893 }
4894
4895 v = startswith(p, "netns-socket-1=");
4896 if (v) {
4897 char *buf;
4898
4899 n = strcspn(v, " ");
4900 buf = strndupa(v, n);
4901 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
4902 log_debug("Unable to process exec-runtime netns fd specification.");
4903 return;
4904 }
4905 fd1 = fdset_remove(fds, fd1);
4906 }
4907
4908 finalize:
4909
4910 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
4911 if (r < 0) {
4912 log_debug_errno(r, "Failed to add exec-runtime: %m");
4913 return;
4914 }
4915 }
4916
4917 void exec_runtime_vacuum(Manager *m) {
4918 ExecRuntime *rt;
4919 Iterator i;
4920
4921 assert(m);
4922
4923 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
4924
4925 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4926 if (rt->n_ref > 0)
4927 continue;
4928
4929 (void) exec_runtime_free(rt, false);
4930 }
4931 }
4932
4933 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
4934 [EXEC_INPUT_NULL] = "null",
4935 [EXEC_INPUT_TTY] = "tty",
4936 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4937 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
4938 [EXEC_INPUT_SOCKET] = "socket",
4939 [EXEC_INPUT_NAMED_FD] = "fd",
4940 [EXEC_INPUT_DATA] = "data",
4941 [EXEC_INPUT_FILE] = "file",
4942 };
4943
4944 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
4945
4946 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
4947 [EXEC_OUTPUT_INHERIT] = "inherit",
4948 [EXEC_OUTPUT_NULL] = "null",
4949 [EXEC_OUTPUT_TTY] = "tty",
4950 [EXEC_OUTPUT_SYSLOG] = "syslog",
4951 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
4952 [EXEC_OUTPUT_KMSG] = "kmsg",
4953 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
4954 [EXEC_OUTPUT_JOURNAL] = "journal",
4955 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
4956 [EXEC_OUTPUT_SOCKET] = "socket",
4957 [EXEC_OUTPUT_NAMED_FD] = "fd",
4958 [EXEC_OUTPUT_FILE] = "file",
4959 };
4960
4961 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
4962
4963 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
4964 [EXEC_UTMP_INIT] = "init",
4965 [EXEC_UTMP_LOGIN] = "login",
4966 [EXEC_UTMP_USER] = "user",
4967 };
4968
4969 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
4970
4971 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
4972 [EXEC_PRESERVE_NO] = "no",
4973 [EXEC_PRESERVE_YES] = "yes",
4974 [EXEC_PRESERVE_RESTART] = "restart",
4975 };
4976
4977 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
4978
4979 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
4980 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
4981 [EXEC_DIRECTORY_STATE] = "StateDirectory",
4982 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
4983 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
4984 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
4985 };
4986
4987 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
4988
4989 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
4990 [EXEC_KEYRING_INHERIT] = "inherit",
4991 [EXEC_KEYRING_PRIVATE] = "private",
4992 [EXEC_KEYRING_SHARED] = "shared",
4993 };
4994
4995 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);