]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
rlimit-util: rework rlimit_{from|to}_string() to work without "Limit" prefix
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6 ***/
7
8 #include <errno.h>
9 #include <fcntl.h>
10 #include <glob.h>
11 #include <grp.h>
12 #include <poll.h>
13 #include <signal.h>
14 #include <string.h>
15 #include <sys/capability.h>
16 #include <sys/eventfd.h>
17 #include <sys/mman.h>
18 #include <sys/personality.h>
19 #include <sys/prctl.h>
20 #include <sys/shm.h>
21 #include <sys/socket.h>
22 #include <sys/stat.h>
23 #include <sys/types.h>
24 #include <sys/un.h>
25 #include <unistd.h>
26 #include <utmpx.h>
27
28 #if HAVE_PAM
29 #include <security/pam_appl.h>
30 #endif
31
32 #if HAVE_SELINUX
33 #include <selinux/selinux.h>
34 #endif
35
36 #if HAVE_SECCOMP
37 #include <seccomp.h>
38 #endif
39
40 #if HAVE_APPARMOR
41 #include <sys/apparmor.h>
42 #endif
43
44 #include "sd-messages.h"
45
46 #include "af-list.h"
47 #include "alloc-util.h"
48 #if HAVE_APPARMOR
49 #include "apparmor-util.h"
50 #endif
51 #include "async.h"
52 #include "barrier.h"
53 #include "cap-list.h"
54 #include "capability-util.h"
55 #include "chown-recursive.h"
56 #include "cpu-set-util.h"
57 #include "def.h"
58 #include "env-util.h"
59 #include "errno-list.h"
60 #include "execute.h"
61 #include "exit-status.h"
62 #include "fd-util.h"
63 #include "fileio.h"
64 #include "format-util.h"
65 #include "fs-util.h"
66 #include "glob-util.h"
67 #include "io-util.h"
68 #include "ioprio.h"
69 #include "label.h"
70 #include "log.h"
71 #include "macro.h"
72 #include "manager.h"
73 #include "missing.h"
74 #include "mkdir.h"
75 #include "namespace.h"
76 #include "parse-util.h"
77 #include "path-util.h"
78 #include "process-util.h"
79 #include "rlimit-util.h"
80 #include "rm-rf.h"
81 #if HAVE_SECCOMP
82 #include "seccomp-util.h"
83 #endif
84 #include "securebits.h"
85 #include "securebits-util.h"
86 #include "selinux-util.h"
87 #include "signal-util.h"
88 #include "smack-util.h"
89 #include "socket-util.h"
90 #include "special.h"
91 #include "stat-util.h"
92 #include "string-table.h"
93 #include "string-util.h"
94 #include "strv.h"
95 #include "syslog-util.h"
96 #include "terminal-util.h"
97 #include "unit.h"
98 #include "user-util.h"
99 #include "util.h"
100 #include "utmp-wtmp.h"
101
/* Idle timeouts, expressed as multiples of USEC_PER_SEC. They are not referenced in this part of the
 * file. NOTE(review): presumably used by the idle-pipe wait logic further down — confirm. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* This assumes there is a 'tty' group */
/* Access mode chown_terminal() applies to (and afterwards verifies on) a terminal it takes over. */
#define TTY_MODE 0620

/* Send buffer size requested (via fd_inc_sndbuf()) for the stream socket connected to the journal. */
#define SNDBUF_SIZE (8*1024*1024)
109
/* Moves the n_fds descriptors in fds[] so that entry i ends up as file descriptor i+3. Entries are
 * duplicated with F_DUPFD and the old descriptor is closed; the array is updated in place. Returns 0
 * on success or a negative errno if duplicating a descriptor fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0, retry_at;

        if (n_fds == 0)
                return 0;

        /* Modifies the fds array! (sorts it) */

        assert(fds);

        do {
                int idx;

                retry_at = -1;

                for (idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, dup_fd;

                        /* Nothing to do if the fd already sits in its slot */
                        if (fds[idx] == wanted)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* F_DUPFD gave us a higher fd because the wanted one was still taken.
                         * Remember the first such index and run another pass from there. */
                        if (retry_at < 0 && dup_fd != wanted)
                                retry_at = idx;
                }

                first = retry_at;
        } while (retry_at >= 0);

        return 0;
}
154
/* Adjusts the file flags of the descriptors that are passed on to the child: FD_CLOEXEC is dropped
 * from every entry, and O_NONBLOCK is set/cleared (according to 'nonblock') on the entries whose
 * index is below n_socket_fds. Returns 0 on success, or the first negative error returned by
 * fd_nonblock()/fd_cloexec(). */
static int flags_fds(const int fds[], size_t n_storage_fds, size_t n_socket_fds, bool nonblock) {
        size_t total, idx;
        int r;

        total = n_storage_fds + n_socket_fds;
        if (total == 0)
                return 0;

        assert(fds);

        for (idx = 0; idx < total; idx++) {

                /* O_NONBLOCK is only a concern for socket activation */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* The fds are meant to survive exec() in our children, hence FD_CLOEXEC is
                 * always turned off */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
187
188 static const char *exec_context_tty_path(const ExecContext *context) {
189 assert(context);
190
191 if (context->stdio_as_fds)
192 return NULL;
193
194 if (context->tty_path)
195 return context->tty_path;
196
197 return "/dev/console";
198 }
199
200 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
201 const char *path;
202
203 assert(context);
204
205 path = exec_context_tty_path(context);
206
207 if (context->tty_vhangup) {
208 if (p && p->stdin_fd >= 0)
209 (void) terminal_vhangup_fd(p->stdin_fd);
210 else if (path)
211 (void) terminal_vhangup(path);
212 }
213
214 if (context->tty_reset) {
215 if (p && p->stdin_fd >= 0)
216 (void) reset_terminal_fd(p->stdin_fd, true);
217 else if (path)
218 (void) reset_terminal(path);
219 }
220
221 if (context->tty_vt_disallocate && path)
222 (void) vt_disallocate(path);
223 }
224
225 static bool is_terminal_input(ExecInput i) {
226 return IN_SET(i,
227 EXEC_INPUT_TTY,
228 EXEC_INPUT_TTY_FORCE,
229 EXEC_INPUT_TTY_FAIL);
230 }
231
232 static bool is_terminal_output(ExecOutput o) {
233 return IN_SET(o,
234 EXEC_OUTPUT_TTY,
235 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
236 EXEC_OUTPUT_KMSG_AND_CONSOLE,
237 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
238 }
239
240 static bool is_syslog_output(ExecOutput o) {
241 return IN_SET(o,
242 EXEC_OUTPUT_SYSLOG,
243 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
244 }
245
246 static bool is_kmsg_output(ExecOutput o) {
247 return IN_SET(o,
248 EXEC_OUTPUT_KMSG,
249 EXEC_OUTPUT_KMSG_AND_CONSOLE);
250 }
251
252 static bool exec_context_needs_term(const ExecContext *c) {
253 assert(c);
254
255 /* Return true if the execution context suggests we should set $TERM to something useful. */
256
257 if (is_terminal_input(c->std_input))
258 return true;
259
260 if (is_terminal_output(c->std_output))
261 return true;
262
263 if (is_terminal_output(c->std_error))
264 return true;
265
266 return !!c->tty_path;
267 }
268
/* Opens /dev/null with the given access flags (plus O_NOCTTY) and hands the result to move_fd() so
 * that it ends up as descriptor 'nfd'. Returns a negative errno if the open fails, otherwise
 * whatever move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
280
281 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
282 static const union sockaddr_union sa = {
283 .un.sun_family = AF_UNIX,
284 .un.sun_path = "/run/systemd/journal/stdout",
285 };
286 uid_t olduid = UID_INVALID;
287 gid_t oldgid = GID_INVALID;
288 int r;
289
290 if (gid_is_valid(gid)) {
291 oldgid = getgid();
292
293 if (setegid(gid) < 0)
294 return -errno;
295 }
296
297 if (uid_is_valid(uid)) {
298 olduid = getuid();
299
300 if (seteuid(uid) < 0) {
301 r = -errno;
302 goto restore_gid;
303 }
304 }
305
306 r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
307
308 /* If we fail to restore the uid or gid, things will likely
309 fail later on. This should only happen if an LSM interferes. */
310
311 if (uid_is_valid(uid))
312 (void) seteuid(olduid);
313
314 restore_gid:
315 if (gid_is_valid(gid))
316 (void) setegid(oldgid);
317
318 return r;
319 }
320
321 static int connect_logger_as(
322 const Unit *unit,
323 const ExecContext *context,
324 const ExecParameters *params,
325 ExecOutput output,
326 const char *ident,
327 int nfd,
328 uid_t uid,
329 gid_t gid) {
330
331 int fd, r;
332
333 assert(context);
334 assert(params);
335 assert(output < _EXEC_OUTPUT_MAX);
336 assert(ident);
337 assert(nfd >= 0);
338
339 fd = socket(AF_UNIX, SOCK_STREAM, 0);
340 if (fd < 0)
341 return -errno;
342
343 r = connect_journal_socket(fd, uid, gid);
344 if (r < 0)
345 return r;
346
347 if (shutdown(fd, SHUT_RD) < 0) {
348 safe_close(fd);
349 return -errno;
350 }
351
352 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
353
354 dprintf(fd,
355 "%s\n"
356 "%s\n"
357 "%i\n"
358 "%i\n"
359 "%i\n"
360 "%i\n"
361 "%i\n",
362 context->syslog_identifier ?: ident,
363 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
364 context->syslog_priority,
365 !!context->syslog_level_prefix,
366 is_syslog_output(output),
367 is_kmsg_output(output),
368 is_terminal_output(output));
369
370 return move_fd(fd, nfd, false);
371 }
/* Opens the terminal at 'path' (never as controlling TTY) and passes it to move_fd() so that it ends
 * up as descriptor 'nfd'. Returns the negative error from open_terminal() on failure, otherwise
 * whatever move_fd() returns. */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
384
385 static int acquire_path(const char *path, int flags, mode_t mode) {
386 union sockaddr_union sa = {
387 .sa.sa_family = AF_UNIX,
388 };
389 int fd, r;
390
391 assert(path);
392
393 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
394 flags |= O_CREAT;
395
396 fd = open(path, flags|O_NOCTTY, mode);
397 if (fd >= 0)
398 return fd;
399
400 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
401 return -errno;
402 if (strlen(path) > sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
403 return -ENXIO;
404
405 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
406
407 fd = socket(AF_UNIX, SOCK_STREAM, 0);
408 if (fd < 0)
409 return -errno;
410
411 strncpy(sa.un.sun_path, path, sizeof(sa.un.sun_path));
412 if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
413 safe_close(fd);
414 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
415 * indication that his wasn't an AF_UNIX socket after all */
416 }
417
418 if ((flags & O_ACCMODE) == O_RDONLY)
419 r = shutdown(fd, SHUT_WR);
420 else if ((flags & O_ACCMODE) == O_WRONLY)
421 r = shutdown(fd, SHUT_RD);
422 else
423 return fd;
424 if (r < 0) {
425 safe_close(fd);
426 return -errno;
427 }
428
429 return fd;
430 }
431
432 static int fixup_input(
433 const ExecContext *context,
434 int socket_fd,
435 bool apply_tty_stdin) {
436
437 ExecInput std_input;
438
439 assert(context);
440
441 std_input = context->std_input;
442
443 if (is_terminal_input(std_input) && !apply_tty_stdin)
444 return EXEC_INPUT_NULL;
445
446 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
447 return EXEC_INPUT_NULL;
448
449 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
450 return EXEC_INPUT_NULL;
451
452 return std_input;
453 }
454
455 static int fixup_output(ExecOutput std_output, int socket_fd) {
456
457 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
458 return EXEC_OUTPUT_INHERIT;
459
460 return std_output;
461 }
462
463 static int setup_input(
464 const ExecContext *context,
465 const ExecParameters *params,
466 int socket_fd,
467 int named_iofds[3]) {
468
469 ExecInput i;
470
471 assert(context);
472 assert(params);
473
474 if (params->stdin_fd >= 0) {
475 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
476 return -errno;
477
478 /* Try to make this the controlling tty, if it is a tty, and reset it */
479 if (isatty(STDIN_FILENO)) {
480 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
481 (void) reset_terminal_fd(STDIN_FILENO, true);
482 }
483
484 return STDIN_FILENO;
485 }
486
487 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
488
489 switch (i) {
490
491 case EXEC_INPUT_NULL:
492 return open_null_as(O_RDONLY, STDIN_FILENO);
493
494 case EXEC_INPUT_TTY:
495 case EXEC_INPUT_TTY_FORCE:
496 case EXEC_INPUT_TTY_FAIL: {
497 int fd;
498
499 fd = acquire_terminal(exec_context_tty_path(context),
500 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
501 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
502 ACQUIRE_TERMINAL_WAIT,
503 USEC_INFINITY);
504 if (fd < 0)
505 return fd;
506
507 return move_fd(fd, STDIN_FILENO, false);
508 }
509
510 case EXEC_INPUT_SOCKET:
511 assert(socket_fd >= 0);
512
513 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
514
515 case EXEC_INPUT_NAMED_FD:
516 assert(named_iofds[STDIN_FILENO] >= 0);
517
518 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
519 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
520
521 case EXEC_INPUT_DATA: {
522 int fd;
523
524 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
525 if (fd < 0)
526 return fd;
527
528 return move_fd(fd, STDIN_FILENO, false);
529 }
530
531 case EXEC_INPUT_FILE: {
532 bool rw;
533 int fd;
534
535 assert(context->stdio_file[STDIN_FILENO]);
536
537 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
538 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
539
540 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
541 if (fd < 0)
542 return fd;
543
544 return move_fd(fd, STDIN_FILENO, false);
545 }
546
547 default:
548 assert_not_reached("Unknown input type");
549 }
550 }
551
/* Sets up descriptor 'fileno' (STDOUT_FILENO or STDERR_FILENO) for the process about to be executed.
 *
 * An explicitly passed stdout/stderr fd in 'params' takes precedence. Otherwise the (fixed-up)
 * output mode of the context decides where the descriptor is connected to. For stderr the code
 * expects that stdin and stdout have been set up already, since it may duplicate STDOUT_FILENO.
 *
 * If the descriptor gets connected to the journal via a stream, the device and inode numbers of the
 * stream are stored in *journal_stream_dev and *journal_stream_ino.
 *
 * Returns a negative errno-style value on failure; otherwise 'fileno' or the result of the helper
 * that installed the descriptor (open_null_as(), open_terminal_as(), connect_logger_as(),
 * move_fd()). */
static int setup_output(
                const Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                int fileno,
                int socket_fd,
                int named_iofds[3],
                const char *ident,
                uid_t uid,
                gid_t gid,
                dev_t *journal_stream_dev,
                ino_t *journal_stream_ino) {

        ExecOutput o;
        ExecInput i;
        int r;

        assert(unit);
        assert(context);
        assert(params);
        assert(ident);
        assert(journal_stream_dev);
        assert(journal_stream_ino);

        /* Explicitly passed fds win over anything configured in the context */
        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {

                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
                        return -errno;

                return STDOUT_FILENO;
        }

        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
                        return -errno;

                return STDERR_FILENO;
        }

        /* Determine the effective input mode and the effective stdout mode */
        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
        o = fixup_output(context->std_output, socket_fd);

        if (fileno == STDERR_FILENO) {
                ExecOutput e;
                e = fixup_output(context->std_error, socket_fd);

                /* This expects the input and output are already set up */

                /* Don't change the stderr file descriptor if we inherit all
                 * the way and are not on a tty */
                if (e == EXEC_OUTPUT_INHERIT &&
                    o == EXEC_OUTPUT_INHERIT &&
                    i == EXEC_INPUT_NULL &&
                    !is_terminal_input(context->std_input) &&
                    getppid () != 1)
                        return fileno;

                /* Duplicate from stdout if possible */
                if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
                        return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;

                /* From here on, 'o' is the mode to apply to stderr */
                o = e;

        } else if (o == EXEC_OUTPUT_INHERIT) {
                /* If input got downgraded, inherit the original value */
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

                /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
                if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
                if (getppid() != 1)
                        return fileno;

                /* We need to open /dev/null here anew, to get the right access mode. */
                return open_null_as(O_WRONLY, fileno);
        }

        switch (o) {

        case EXEC_OUTPUT_NULL:
                return open_null_as(O_WRONLY, fileno);

        case EXEC_OUTPUT_TTY:
                /* If stdin is a terminal already, share it */
                if (is_terminal_input(i))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* We don't reset the terminal if this is just about output */
                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

        case EXEC_OUTPUT_SYSLOG:
        case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
        case EXEC_OUTPUT_KMSG:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
                if (r < 0) {
                        /* Not being able to log is not fatal: warn and fall back to /dev/null */
                        log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
                        r = open_null_as(O_WRONLY, fileno);
                } else {
                        struct stat st;

                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
                         * services to detect whether they are connected to the journal or not.
                         *
                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
                         * about STDERR as that's usually the best way to do logging. */

                        if (fstat(fileno, &st) >= 0 &&
                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
                                *journal_stream_dev = st.st_dev;
                                *journal_stream_ino = st.st_ino;
                        }
                }
                return r;

        case EXEC_OUTPUT_SOCKET:
                assert(socket_fd >= 0);

                return dup2(socket_fd, fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_NAMED_FD:
                assert(named_iofds[fileno] >= 0);

                (void) fd_nonblock(named_iofds[fileno], false);
                return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_FILE: {
                bool rw;
                int fd;

                assert(context->stdio_file[fileno]);

                /* If stdin reads from the very same file, reuse the stdin fd; setup_input() opens
                 * the file read/write when stdout or stderr share its path */
                rw = context->std_input == EXEC_INPUT_FILE &&
                        streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);

                if (rw)
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                fd = acquire_path(context->stdio_file[fileno], O_WRONLY, 0666 & ~context->umask);
                if (fd < 0)
                        return fd;

                return move_fd(fd, fileno, false);
        }

        default:
                assert_not_reached("Unknown error type");
        }
}
706
707 static int chown_terminal(int fd, uid_t uid) {
708 struct stat st;
709
710 assert(fd >= 0);
711
712 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
713 if (isatty(fd) < 1)
714 return 0;
715
716 /* This might fail. What matters are the results. */
717 (void) fchown(fd, uid, -1);
718 (void) fchmod(fd, TTY_MODE);
719
720 if (fstat(fd, &st) < 0)
721 return -errno;
722
723 if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
724 return -EPERM;
725
726 return 0;
727 }
728
729 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
730 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
731 int r;
732
733 assert(_saved_stdin);
734 assert(_saved_stdout);
735
736 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
737 if (saved_stdin < 0)
738 return -errno;
739
740 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
741 if (saved_stdout < 0)
742 return -errno;
743
744 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
745 if (fd < 0)
746 return fd;
747
748 r = chown_terminal(fd, getuid());
749 if (r < 0)
750 return r;
751
752 r = reset_terminal_fd(fd, true);
753 if (r < 0)
754 return r;
755
756 r = rearrange_stdio(fd, fd, STDERR_FILENO);
757 fd = -1;
758 if (r < 0)
759 return r;
760
761 *_saved_stdin = saved_stdin;
762 *_saved_stdout = saved_stdout;
763
764 saved_stdin = saved_stdout = -1;
765
766 return 0;
767 }
768
769 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
770 assert(err < 0);
771
772 if (err == -ETIMEDOUT)
773 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
774 else {
775 errno = -err;
776 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
777 }
778 }
779
780 static void write_confirm_error(int err, const char *vc, const Unit *u) {
781 _cleanup_close_ int fd = -1;
782
783 assert(vc);
784
785 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
786 if (fd < 0)
787 return;
788
789 write_confirm_error_fd(err, fd, u);
790 }
791
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved stdin/stdout back in place (if
 * set) and closes the saved copies. Returns 0 on success, or the negative errno of the last dup2()
 * that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int r = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                r = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                r = -errno;

        /* The copies are closed in any case, even if restoring failed */
        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return r;
}
813
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* 'f': don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS = 0,  /* 's': don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE = 1,          /* 'y'/'c', and the fallback on internal errors: execute the command */
};
819
820 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
821 int saved_stdout = -1, saved_stdin = -1, r;
822 _cleanup_free_ char *e = NULL;
823 char c;
824
825 /* For any internal errors, assume a positive response. */
826 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
827 if (r < 0) {
828 write_confirm_error(r, vc, u);
829 return CONFIRM_EXECUTE;
830 }
831
832 /* confirm_spawn might have been disabled while we were sleeping. */
833 if (manager_is_confirm_spawn_disabled(u->manager)) {
834 r = 1;
835 goto restore_stdio;
836 }
837
838 e = ellipsize(cmdline, 60, 100);
839 if (!e) {
840 log_oom();
841 r = CONFIRM_EXECUTE;
842 goto restore_stdio;
843 }
844
845 for (;;) {
846 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
847 if (r < 0) {
848 write_confirm_error_fd(r, STDOUT_FILENO, u);
849 r = CONFIRM_EXECUTE;
850 goto restore_stdio;
851 }
852
853 switch (c) {
854 case 'c':
855 printf("Resuming normal execution.\n");
856 manager_disable_confirm_spawn();
857 r = 1;
858 break;
859 case 'D':
860 unit_dump(u, stdout, " ");
861 continue; /* ask again */
862 case 'f':
863 printf("Failing execution.\n");
864 r = CONFIRM_PRETEND_FAILURE;
865 break;
866 case 'h':
867 printf(" c - continue, proceed without asking anymore\n"
868 " D - dump, show the state of the unit\n"
869 " f - fail, don't execute the command and pretend it failed\n"
870 " h - help\n"
871 " i - info, show a short summary of the unit\n"
872 " j - jobs, show jobs that are in progress\n"
873 " s - skip, don't execute the command and pretend it succeeded\n"
874 " y - yes, execute the command\n");
875 continue; /* ask again */
876 case 'i':
877 printf(" Description: %s\n"
878 " Unit: %s\n"
879 " Command: %s\n",
880 u->id, u->description, cmdline);
881 continue; /* ask again */
882 case 'j':
883 manager_dump_jobs(u->manager, stdout, " ");
884 continue; /* ask again */
885 case 'n':
886 /* 'n' was removed in favor of 'f'. */
887 printf("Didn't understand 'n', did you mean 'f'?\n");
888 continue; /* ask again */
889 case 's':
890 printf("Skipping execution.\n");
891 r = CONFIRM_PRETEND_SUCCESS;
892 break;
893 case 'y':
894 r = CONFIRM_EXECUTE;
895 break;
896 default:
897 assert_not_reached("Unhandled choice");
898 }
899 break;
900 }
901
902 restore_stdio:
903 restore_confirm_stdio(&saved_stdin, &saved_stdout);
904 return r;
905 }
906
907 static int get_fixed_user(const ExecContext *c, const char **user,
908 uid_t *uid, gid_t *gid,
909 const char **home, const char **shell) {
910 int r;
911 const char *name;
912
913 assert(c);
914
915 if (!c->user)
916 return 0;
917
918 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
919 * (i.e. are "/" or "/bin/nologin"). */
920
921 name = c->user;
922 r = get_user_creds_clean(&name, uid, gid, home, shell);
923 if (r < 0)
924 return r;
925
926 *user = name;
927 return 0;
928 }
929
930 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
931 int r;
932 const char *name;
933
934 assert(c);
935
936 if (!c->group)
937 return 0;
938
939 name = c->group;
940 r = get_group_creds(&name, gid);
941 if (r < 0)
942 return r;
943
944 *group = name;
945 return 0;
946 }
947
948 static int get_supplementary_groups(const ExecContext *c, const char *user,
949 const char *group, gid_t gid,
950 gid_t **supplementary_gids, int *ngids) {
951 char **i;
952 int r, k = 0;
953 int ngroups_max;
954 bool keep_groups = false;
955 gid_t *groups = NULL;
956 _cleanup_free_ gid_t *l_gids = NULL;
957
958 assert(c);
959
960 /*
961 * If user is given, then lookup GID and supplementary groups list.
962 * We avoid NSS lookups for gid=0. Also we have to initialize groups
963 * here and as early as possible so we keep the list of supplementary
964 * groups of the caller.
965 */
966 if (user && gid_is_valid(gid) && gid != 0) {
967 /* First step, initialize groups from /etc/groups */
968 if (initgroups(user, gid) < 0)
969 return -errno;
970
971 keep_groups = true;
972 }
973
974 if (strv_isempty(c->supplementary_groups))
975 return 0;
976
977 /*
978 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
979 * be positive, otherwise fail.
980 */
981 errno = 0;
982 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
983 if (ngroups_max <= 0) {
984 if (errno > 0)
985 return -errno;
986 else
987 return -EOPNOTSUPP; /* For all other values */
988 }
989
990 l_gids = new(gid_t, ngroups_max);
991 if (!l_gids)
992 return -ENOMEM;
993
994 if (keep_groups) {
995 /*
996 * Lookup the list of groups that the user belongs to, we
997 * avoid NSS lookups here too for gid=0.
998 */
999 k = ngroups_max;
1000 if (getgrouplist(user, gid, l_gids, &k) < 0)
1001 return -EINVAL;
1002 } else
1003 k = 0;
1004
1005 STRV_FOREACH(i, c->supplementary_groups) {
1006 const char *g;
1007
1008 if (k >= ngroups_max)
1009 return -E2BIG;
1010
1011 g = *i;
1012 r = get_group_creds(&g, l_gids+k);
1013 if (r < 0)
1014 return r;
1015
1016 k++;
1017 }
1018
1019 /*
1020 * Sets ngids to zero to drop all supplementary groups, happens
1021 * when we are under root and SupplementaryGroups= is empty.
1022 */
1023 if (k == 0) {
1024 *ngids = 0;
1025 return 0;
1026 }
1027
1028 /* Otherwise get the final list of supplementary groups */
1029 groups = memdup(l_gids, sizeof(gid_t) * k);
1030 if (!groups)
1031 return -ENOMEM;
1032
1033 *supplementary_gids = groups;
1034 *ngids = k;
1035
1036 groups = NULL;
1037
1038 return 0;
1039 }
1040
/* Applies the group credentials: installs the supplementary group list if it is non-empty, then
 * sets the real, effective and saved GID to 'gid' if it is valid. Returns 0 on success, a negative
 * errno-style value on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        int r;

        /* SupplementaryGroups= handling, only if there is anything to set */
        if (ngids > 0) {
                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1059
1060 static int enforce_user(const ExecContext *context, uid_t uid) {
1061 assert(context);
1062
1063 if (!uid_is_valid(uid))
1064 return 0;
1065
1066 /* Sets (but doesn't look up) the uid and make sure we keep the
1067 * capabilities while doing so. */
1068
1069 if (context->capability_ambient_set != 0) {
1070
1071 /* First step: If we need to keep capabilities but
1072 * drop privileges we need to make sure we keep our
1073 * caps, while we drop privileges. */
1074 if (uid != 0) {
1075 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1076
1077 if (prctl(PR_GET_SECUREBITS) != sb)
1078 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1079 return -errno;
1080 }
1081 }
1082
1083 /* Second step: actually set the uids */
1084 if (setresuid(uid, uid, uid) < 0)
1085 return -errno;
1086
1087 /* At this point we should have all necessary capabilities but
1088 are otherwise a normal user. However, the caps might got
1089 corrupted due to the setresuid() so we need clean them up
1090 later. This is done outside of this call. */
1091
1092 return 0;
1093 }
1094
#if HAVE_PAM

/* PAM conversation callback, installed by setup_pam(). It ignores all of its arguments and always
 * fails, as conversations are not supported. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        return PAM_CONV_ERR;
}

#endif
1109
/* Opens a PAM session for 'user' under the PAM service 'name' and forks off a helper process
 * ("(sd-pam)") whose job is to close the session again.
 *
 * 'tty', if set, is passed to PAM as PAM_TTY. The entries of *env are exported into the PAM
 * environment before the session is opened; on success *env is replaced by the environment list
 * PAM returns. 'fds'/'n_fds' are descriptors that are closed in the helper process.
 *
 * Returns the result of strv_free_and_replace() on success. On failure returns a negative
 * errno-style value, which is -EPERM for all PAM-level errors. If compiled without PAM support this
 * is a no-op that returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export our environment into the PAM environment */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        /* Remember our PID, so that the child can detect whether we are still its parent */
        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* Sleep until SIGTERM arrives, retrying if the wait gets interrupted */
                        for (;;) {
                                if (sigwait(&ss, &sig) < 0) {
                                        if (errno == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment list acquired from PAM to the caller, replacing the old one */
        return strv_free_and_replace(*env, e);

fail:
        /* Two kinds of failures end up here: PAM errors (pam_code is set) and errno-style errors
         * (r is set) */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1315
/* Renames the current process to "(<name>)", where <name> is derived from the last component of the
 * given path, truncated to its final 8 characters. */
static void rename_process_from_path(const char *path) {
        char buf[11]; /* '(' + up to 8 characters + ')' + NUL */
        const char *name;
        char *w = buf;
        size_t len;

        /* The result must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in /bin/ps */

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        len = strlen(name);
        if (len > 8) {
                /* Keep the tail: it is usually the more interesting part, since the beginning might
                 * just be "systemd-" */
                name += len - 8;
                len = 8;
        }

        *w++ = '(';
        memcpy(w, name, len);
        w += len;
        *w++ = ')';
        *w = 0;

        rename_process(buf);
}
1346
1347 static bool context_has_address_families(const ExecContext *c) {
1348 assert(c);
1349
1350 return c->address_families_whitelist ||
1351 !set_isempty(c->address_families);
1352 }
1353
1354 static bool context_has_syscall_filters(const ExecContext *c) {
1355 assert(c);
1356
1357 return c->syscall_whitelist ||
1358 !hashmap_isempty(c->syscall_filter);
1359 }
1360
1361 static bool context_has_no_new_privileges(const ExecContext *c) {
1362 assert(c);
1363
1364 if (c->no_new_privileges)
1365 return true;
1366
1367 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1368 return false;
1369
1370 /* We need NNP if we have any form of seccomp and are unprivileged */
1371 return context_has_address_families(c) ||
1372 c->memory_deny_write_execute ||
1373 c->restrict_realtime ||
1374 exec_context_restrict_namespaces_set(c) ||
1375 c->protect_kernel_tunables ||
1376 c->protect_kernel_modules ||
1377 c->private_devices ||
1378 context_has_syscall_filters(c) ||
1379 !set_isempty(c->syscall_archs) ||
1380 c->lock_personality;
1381 }
1382
1383 #if HAVE_SECCOMP
1384
1385 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1386
1387 if (is_seccomp_available())
1388 return false;
1389
1390 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1391 return true;
1392 }
1393
1394 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1395 uint32_t negative_action, default_action, action;
1396 int r;
1397
1398 assert(u);
1399 assert(c);
1400
1401 if (!context_has_syscall_filters(c))
1402 return 0;
1403
1404 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1405 return 0;
1406
1407 negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1408
1409 if (c->syscall_whitelist) {
1410 default_action = negative_action;
1411 action = SCMP_ACT_ALLOW;
1412 } else {
1413 default_action = SCMP_ACT_ALLOW;
1414 action = negative_action;
1415 }
1416
1417 if (needs_ambient_hack) {
1418 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1419 if (r < 0)
1420 return r;
1421 }
1422
1423 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1424 }
1425
1426 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1427 assert(u);
1428 assert(c);
1429
1430 if (set_isempty(c->syscall_archs))
1431 return 0;
1432
1433 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1434 return 0;
1435
1436 return seccomp_restrict_archs(c->syscall_archs);
1437 }
1438
1439 static int apply_address_families(const Unit* u, const ExecContext *c) {
1440 assert(u);
1441 assert(c);
1442
1443 if (!context_has_address_families(c))
1444 return 0;
1445
1446 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1447 return 0;
1448
1449 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1450 }
1451
1452 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1453 assert(u);
1454 assert(c);
1455
1456 if (!c->memory_deny_write_execute)
1457 return 0;
1458
1459 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1460 return 0;
1461
1462 return seccomp_memory_deny_write_execute();
1463 }
1464
1465 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1466 assert(u);
1467 assert(c);
1468
1469 if (!c->restrict_realtime)
1470 return 0;
1471
1472 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1473 return 0;
1474
1475 return seccomp_restrict_realtime();
1476 }
1477
1478 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1479 assert(u);
1480 assert(c);
1481
1482 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1483 * let's protect even those systems where this is left on in the kernel. */
1484
1485 if (!c->protect_kernel_tunables)
1486 return 0;
1487
1488 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1489 return 0;
1490
1491 return seccomp_protect_sysctl();
1492 }
1493
1494 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1495 assert(u);
1496 assert(c);
1497
1498 /* Turn off module syscalls on ProtectKernelModules=yes */
1499
1500 if (!c->protect_kernel_modules)
1501 return 0;
1502
1503 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1504 return 0;
1505
1506 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1507 }
1508
1509 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1510 assert(u);
1511 assert(c);
1512
1513 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1514
1515 if (!c->private_devices)
1516 return 0;
1517
1518 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1519 return 0;
1520
1521 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1522 }
1523
1524 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1525 assert(u);
1526 assert(c);
1527
1528 if (!exec_context_restrict_namespaces_set(c))
1529 return 0;
1530
1531 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1532 return 0;
1533
1534 return seccomp_restrict_namespaces(c->restrict_namespaces);
1535 }
1536
1537 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1538 unsigned long personality;
1539 int r;
1540
1541 assert(u);
1542 assert(c);
1543
1544 if (!c->lock_personality)
1545 return 0;
1546
1547 if (skip_seccomp_unavailable(u, "LockPersonality="))
1548 return 0;
1549
1550 personality = c->personality;
1551
1552 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1553 if (personality == PERSONALITY_INVALID) {
1554
1555 r = opinionated_personality(&personality);
1556 if (r < 0)
1557 return r;
1558 }
1559
1560 return seccomp_lock_personality(personality);
1561 }
1562
1563 #endif
1564
1565 static void do_idle_pipe_dance(int idle_pipe[4]) {
1566 assert(idle_pipe);
1567
1568 idle_pipe[1] = safe_close(idle_pipe[1]);
1569 idle_pipe[2] = safe_close(idle_pipe[2]);
1570
1571 if (idle_pipe[0] >= 0) {
1572 int r;
1573
1574 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1575
1576 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1577 ssize_t n;
1578
1579 /* Signal systemd that we are bored and want to continue. */
1580 n = write(idle_pipe[3], "x", 1);
1581 if (n > 0)
1582 /* Wait for systemd to react to the signal above. */
1583 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1584 }
1585
1586 idle_pipe[0] = safe_close(idle_pipe[0]);
1587
1588 }
1589
1590 idle_pipe[3] = safe_close(idle_pipe[3]);
1591 }
1592
1593 static int build_environment(
1594 const Unit *u,
1595 const ExecContext *c,
1596 const ExecParameters *p,
1597 size_t n_fds,
1598 const char *home,
1599 const char *username,
1600 const char *shell,
1601 dev_t journal_stream_dev,
1602 ino_t journal_stream_ino,
1603 char ***ret) {
1604
1605 _cleanup_strv_free_ char **our_env = NULL;
1606 size_t n_env = 0;
1607 char *x;
1608
1609 assert(u);
1610 assert(c);
1611 assert(ret);
1612
1613 our_env = new0(char*, 14);
1614 if (!our_env)
1615 return -ENOMEM;
1616
1617 if (n_fds > 0) {
1618 _cleanup_free_ char *joined = NULL;
1619
1620 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1621 return -ENOMEM;
1622 our_env[n_env++] = x;
1623
1624 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1625 return -ENOMEM;
1626 our_env[n_env++] = x;
1627
1628 joined = strv_join(p->fd_names, ":");
1629 if (!joined)
1630 return -ENOMEM;
1631
1632 x = strjoin("LISTEN_FDNAMES=", joined);
1633 if (!x)
1634 return -ENOMEM;
1635 our_env[n_env++] = x;
1636 }
1637
1638 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1639 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1640 return -ENOMEM;
1641 our_env[n_env++] = x;
1642
1643 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1644 return -ENOMEM;
1645 our_env[n_env++] = x;
1646 }
1647
1648 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1649 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1650 * check the database directly. */
1651 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1652 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1653 if (!x)
1654 return -ENOMEM;
1655 our_env[n_env++] = x;
1656 }
1657
1658 if (home) {
1659 x = strappend("HOME=", home);
1660 if (!x)
1661 return -ENOMEM;
1662 our_env[n_env++] = x;
1663 }
1664
1665 if (username) {
1666 x = strappend("LOGNAME=", username);
1667 if (!x)
1668 return -ENOMEM;
1669 our_env[n_env++] = x;
1670
1671 x = strappend("USER=", username);
1672 if (!x)
1673 return -ENOMEM;
1674 our_env[n_env++] = x;
1675 }
1676
1677 if (shell) {
1678 x = strappend("SHELL=", shell);
1679 if (!x)
1680 return -ENOMEM;
1681 our_env[n_env++] = x;
1682 }
1683
1684 if (!sd_id128_is_null(u->invocation_id)) {
1685 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1686 return -ENOMEM;
1687
1688 our_env[n_env++] = x;
1689 }
1690
1691 if (exec_context_needs_term(c)) {
1692 const char *tty_path, *term = NULL;
1693
1694 tty_path = exec_context_tty_path(c);
1695
1696 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1697 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1698 * passes to PID 1 ends up all the way in the console login shown. */
1699
1700 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1701 term = getenv("TERM");
1702 if (!term)
1703 term = default_term_for_tty(tty_path);
1704
1705 x = strappend("TERM=", term);
1706 if (!x)
1707 return -ENOMEM;
1708 our_env[n_env++] = x;
1709 }
1710
1711 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1712 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1713 return -ENOMEM;
1714
1715 our_env[n_env++] = x;
1716 }
1717
1718 our_env[n_env++] = NULL;
1719 assert(n_env <= 12);
1720
1721 *ret = TAKE_PTR(our_env);
1722
1723 return 0;
1724 }
1725
1726 static int build_pass_environment(const ExecContext *c, char ***ret) {
1727 _cleanup_strv_free_ char **pass_env = NULL;
1728 size_t n_env = 0, n_bufsize = 0;
1729 char **i;
1730
1731 STRV_FOREACH(i, c->pass_environment) {
1732 _cleanup_free_ char *x = NULL;
1733 char *v;
1734
1735 v = getenv(*i);
1736 if (!v)
1737 continue;
1738 x = strjoin(*i, "=", v);
1739 if (!x)
1740 return -ENOMEM;
1741
1742 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1743 return -ENOMEM;
1744
1745 pass_env[n_env++] = TAKE_PTR(x);
1746 pass_env[n_env] = NULL;
1747 }
1748
1749 *ret = TAKE_PTR(pass_env);
1750
1751 return 0;
1752 }
1753
1754 static bool exec_needs_mount_namespace(
1755 const ExecContext *context,
1756 const ExecParameters *params,
1757 const ExecRuntime *runtime) {
1758
1759 assert(context);
1760 assert(params);
1761
1762 if (context->root_image)
1763 return true;
1764
1765 if (!strv_isempty(context->read_write_paths) ||
1766 !strv_isempty(context->read_only_paths) ||
1767 !strv_isempty(context->inaccessible_paths))
1768 return true;
1769
1770 if (context->n_bind_mounts > 0)
1771 return true;
1772
1773 if (context->n_temporary_filesystems > 0)
1774 return true;
1775
1776 if (context->mount_flags != 0)
1777 return true;
1778
1779 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1780 return true;
1781
1782 if (context->private_devices ||
1783 context->protect_system != PROTECT_SYSTEM_NO ||
1784 context->protect_home != PROTECT_HOME_NO ||
1785 context->protect_kernel_tunables ||
1786 context->protect_kernel_modules ||
1787 context->protect_control_groups)
1788 return true;
1789
1790 if (context->mount_apivfs && (context->root_image || context->root_directory))
1791 return true;
1792
1793 if (context->dynamic_user &&
1794 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1795 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1796 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1797 return true;
1798
1799 return false;
1800 }
1801
/* Moves the calling process into a new user namespace in which only root and the given uid/gid are
 * mapped.
 *
 * uid, gid: the user and group to map onto themselves in addition to root. If a value is 0 or not
 *           valid, only the root mapping is written for it.
 *
 * Returns 0 on success, a negative errno-style error code on failure. */
static int setup_private_users(uid_t uid, gid_t gid) {
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
        _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
        _cleanup_close_ int unshare_ready_fd = -1;
        _cleanup_(sigkill_waitp) pid_t pid = 0;
        uint64_t c = 1; /* the value written to (and read from) the eventfd below */
        ssize_t n;
        int r;

        /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
         * which waits for the parent to create the new user namespace while staying in the original namespace. The
         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
         * continues execution normally. */

        if (uid != 0 && uid_is_valid(uid)) {
                r = asprintf(&uid_map,
                             "0 0 1\n"                      /* Map root → root */
                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
                             uid, uid);
                if (r < 0)
                        return -ENOMEM;
        } else {
                uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
                if (!uid_map)
                        return -ENOMEM;
        }

        if (gid != 0 && gid_is_valid(gid)) {
                r = asprintf(&gid_map,
                             "0 0 1\n"                      /* Map root → root */
                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
                             gid, gid);
                if (r < 0)
                        return -ENOMEM;
        } else {
                gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
                if (!gid_map)
                        return -ENOMEM;
        }

        /* Create a communication channel so that the parent can tell the child when it finished creating the user
         * namespace. */
        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
        if (unshare_ready_fd < 0)
                return -errno;

        /* Create a communication channel so that the child can tell the parent a proper error code in case it
         * failed. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return -errno;

        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
        if (r < 0)
                return r;
        if (r == 0) {
                _cleanup_close_ int fd = -1;
                const char *a;
                pid_t ppid;

                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
                 * here, after the parent opened its own user namespace. */

                ppid = getppid();
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Wait until the parent unshared the user namespace */
                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Disable the setgroups() system call in the child user namespace, for good. */
                a = procfs_file_alloca(ppid, "setgroups");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        if (errno != ENOENT) {
                                r = -errno;
                                goto child_fail;
                        }

                        /* If the file is missing the kernel is too old, let's continue anyway. */
                } else {
                        if (write(fd, "deny\n", 5) < 0) {
                                r = -errno;
                                goto child_fail;
                        }

                        fd = safe_close(fd);
                }

                /* First write the GID map */
                a = procfs_file_alloca(ppid, "gid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, gid_map, strlen(gid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }
                fd = safe_close(fd);

                /* Then write the UID map */
                a = procfs_file_alloca(ppid, "uid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, uid_map, strlen(uid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Success: exit without writing anything to the error pipe */
                _exit(EXIT_SUCCESS);

        child_fail:
                /* Report the (negative) error code to the parent through the pipe */
                (void) write(errno_pipe[1], &r, sizeof(r));
                _exit(EXIT_FAILURE);
        }

        /* Parent: close our copy of the write end, so that the read() below sees EOF as soon as the child
         * is gone without having reported an error. */
        errno_pipe[1] = safe_close(errno_pipe[1]);

        if (unshare(CLONE_NEWUSER) < 0)
                return -errno;

        /* Let the child know that the namespace is ready now */
        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
                return -errno;

        /* Try to read an error code from the child */
        n = read(errno_pipe[0], &r, sizeof(r));
        if (n < 0)
                return -errno;
        if (n == sizeof(r)) { /* an error code was sent to us */
                if (r < 0)
                        return r;
                return -EIO; /* a non-negative "error" makes no sense, treat it as generic failure */
        }
        if (n != 0) /* on success we should have read 0 bytes */
                return -EIO;

        r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
        pid = 0; /* disarm the sigkill_waitp cleanup handler */
        if (r < 0)
                return r;
        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
                return -EIO;

        return 0;
}
1956
/* Creates the directories of the given type that are configured in the context, below the prefix
 * configured for that type in the parameters, and (except for configuration directories) changes the
 * ownership of the created trees to uid/gid.
 *
 * context      the execution context listing the directories and their access mode
 * params       the execution parameters; supplies the per-type prefix and flags
 * uid, gid     the owner to apply
 * type         which kind of directory to set up
 * exit_status  on failure, set to the exit status matching the directory type
 *
 * Returns 0 on success (also if no prefix is set for the type), a negative error code on failure. */
static int setup_exec_directory(
                const ExecContext *context,
                const ExecParameters *params,
                uid_t uid,
                gid_t gid,
                ExecDirectoryType type,
                int *exit_status) {

        /* Maps each directory type to the exit status reported if setting it up fails */
        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
                [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
                [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
                [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
                [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
        };
        char **rt;
        int r;

        assert(context);
        assert(params);
        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
        assert(exit_status);

        if (!params->prefix[type])
                return 0;

        /* If we are asked to chown the directories, fall back to root for any invalid UID/GID */
        if (params->flags & EXEC_CHOWN_DIRECTORIES) {
                if (!uid_is_valid(uid))
                        uid = 0;
                if (!gid_is_valid(gid))
                        gid = 0;
        }

        STRV_FOREACH(rt, context->directories[type].paths) {
                /* p is the public path of the directory, pp (only set in the dynamic user case) is its
                 * actual location below the "private" directory. */
                _cleanup_free_ char *p = NULL, *pp = NULL;

                p = strjoin(params->prefix[type], "/", *rt);
                if (!p) {
                        r = -ENOMEM;
                        goto fail;
                }

                r = mkdir_parents_label(p, 0755);
                if (r < 0)
                        goto fail;

                if (context->dynamic_user &&
                    !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
                        _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;

                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
                         * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
                         * whose UID is later on reused. To lock this down we use the same trick used by container
                         * managers to prohibit host users to get access to files of the same UID in containers: we
                         * place everything inside a directory that has an access mode of 0700 and is owned root:root,
                         * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
                         * to make this directory permeable for the service itself.
                         *
                         * Specifically: for a service which wants a special directory "foo/" we first create a
                         * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
                         * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
                         * privileged host users can access "foo/" as usual, but unprivileged host users can't look
                         * into it. Inside of the namespace of the container "private/" is replaced by a more liberally
                         * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
                         * disabling the access boundary for the service and making sure it only gets access to the
                         * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
                         *
                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
                         * owned by the service itself.
                         * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
                         * files or sockets with other services. */

                        private_root = strjoin(params->prefix[type], "/private");
                        if (!private_root) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
                        r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
                        if (r < 0)
                                goto fail;

                        pp = strjoin(private_root, "/", *rt);
                        if (!pp) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* Create all directories between the configured directory and this private root, and mark them 0755 */
                        r = mkdir_parents_label(pp, 0755);
                        if (r < 0)
                                goto fail;

                        if (is_dir(p, false) > 0 &&
                            (laccess(pp, F_OK) < 0 && errno == ENOENT)) {

                                /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
                                 * it over. Most likely the service has been upgraded from one that didn't use
                                 * DynamicUser=1, to one that does. */

                                if (rename(p, pp) < 0) {
                                        r = -errno;
                                        goto fail;
                                }
                        } else {
                                /* Otherwise, create the actual directory for the service */

                                r = mkdir_label(pp, context->directories[type].mode);
                                if (r < 0 && r != -EEXIST)
                                        goto fail;
                        }

                        parent = dirname_malloc(p);
                        if (!parent) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* Determine the path of the private directory relative to the directory the symlink is
                         * placed in */
                        r = path_make_relative(parent, pp, &relative);
                        if (r < 0)
                                goto fail;

                        /* And link it up from the original place */
                        r = symlink_idempotent(relative, p);
                        if (r < 0)
                                goto fail;

                        /* Lock down the access mode */
                        if (chmod(pp, context->directories[type].mode) < 0) {
                                r = -errno;
                                goto fail;
                        }
                } else {
                        r = mkdir_label(p, context->directories[type].mode);
                        /* If the directory already exists we leave it as it is; in particular the ownership
                         * change below is skipped for it. */
                        if (r == -EEXIST)
                                continue;
                        if (r < 0)
                                goto fail;
                }

                /* Don't change the owner of the configuration directory, as in the common case it is not written to by
                 * a service, and shall not be writable. */
                if (type == EXEC_DIRECTORY_CONFIGURATION)
                        continue;

                /* Then, change the ownership of the whole tree, if necessary */
                r = path_chown_recursive(pp ?: p, uid, gid);
                if (r < 0)
                        goto fail;
        }

        return 0;

fail:
        *exit_status = exit_status_table[type];
        return r;
}
2115
2116 #if ENABLE_SMACK
2117 static int setup_smack(
2118 const ExecContext *context,
2119 const ExecCommand *command) {
2120
2121 int r;
2122
2123 assert(context);
2124 assert(command);
2125
2126 if (context->smack_process_label) {
2127 r = mac_smack_apply_pid(0, context->smack_process_label);
2128 if (r < 0)
2129 return r;
2130 }
2131 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2132 else {
2133 _cleanup_free_ char *exec_label = NULL;
2134
2135 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2136 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2137 return r;
2138
2139 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2140 if (r < 0)
2141 return r;
2142 }
2143 #endif
2144
2145 return 0;
2146 }
2147 #endif
2148
/* Compiles the full list of bind mounts to establish for a process: copies of the bind mounts configured
 * in the context, plus one bind mount (onto itself) for each configured directory of every type that
 * has a prefix set.
 *
 * context                the execution context listing the bind mounts and directories
 * params                 the execution parameters; supplies the per-type directory prefixes
 * ret_bind_mounts        receives the newly allocated array (NULL if there is nothing to mount)
 * ret_n_bind_mounts      receives the number of entries in that array
 * ret_empty_directories  receives a string vector with the "private" directories of dynamic users
 *                        (may be NULL)
 *
 * Returns the number of bind mounts (possibly 0) on success, a negative error code on failure, in which
 * case the return parameters are left untouched. */
static int compile_bind_mounts(
                const ExecContext *context,
                const ExecParameters *params,
                BindMount **ret_bind_mounts,
                size_t *ret_n_bind_mounts,
                char ***ret_empty_directories) {

        _cleanup_strv_free_ char **empty_directories = NULL;
        BindMount *bind_mounts;
        size_t n, h = 0, i; /* n: number of entries allocated, h: number of entries filled in so far */
        ExecDirectoryType t;
        int r;

        assert(context);
        assert(params);
        assert(ret_bind_mounts);
        assert(ret_n_bind_mounts);
        assert(ret_empty_directories);

        /* First, count how many entries we are going to need */
        n = context->n_bind_mounts;
        for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                if (!params->prefix[t])
                        continue;

                n += strv_length(context->directories[t].paths);
        }

        if (n <= 0) {
                *ret_bind_mounts = NULL;
                *ret_n_bind_mounts = 0;
                *ret_empty_directories = NULL;
                return 0;
        }

        bind_mounts = new(BindMount, n);
        if (!bind_mounts)
                return -ENOMEM;

        /* Then, duplicate the explicitly configured bind mounts */
        for (i = 0; i < context->n_bind_mounts; i++) {
                BindMount *item = context->bind_mounts + i;
                char *s, *d;

                s = strdup(item->source);
                if (!s) {
                        r = -ENOMEM;
                        goto finish;
                }

                d = strdup(item->destination);
                if (!d) {
                        free(s);
                        r = -ENOMEM;
                        goto finish;
                }

                bind_mounts[h++] = (BindMount) {
                        .source = s,
                        .destination = d,
                        .read_only = item->read_only,
                        .recursive = item->recursive,
                        .ignore_enoent = item->ignore_enoent,
                };
        }

        /* Finally, add one entry for each configured directory */
        for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                char **suffix;

                if (!params->prefix[t])
                        continue;

                if (strv_isempty(context->directories[t].paths))
                        continue;

                if (context->dynamic_user &&
                    !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
                        char *private_root;

                        /* So this is for a dynamic user, and we need to make sure the process can access its own
                         * directory. For that we overmount the usually inaccessible "private" subdirectory with a
                         * tmpfs that makes it accessible and is empty except for the submounts we do this for. */

                        private_root = strjoin(params->prefix[t], "/private");
                        if (!private_root) {
                                r = -ENOMEM;
                                goto finish;
                        }

                        r = strv_consume(&empty_directories, private_root);
                        if (r < 0)
                                goto finish;
                }

                STRV_FOREACH(suffix, context->directories[t].paths) {
                        char *s, *d;

                        /* For dynamic users the directory lives below "private/", see above */
                        if (context->dynamic_user &&
                            !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
                                s = strjoin(params->prefix[t], "/private/", *suffix);
                        else
                                s = strjoin(params->prefix[t], "/", *suffix);
                        if (!s) {
                                r = -ENOMEM;
                                goto finish;
                        }

                        /* Source and destination are the same path */
                        d = strdup(s);
                        if (!d) {
                                free(s);
                                r = -ENOMEM;
                                goto finish;
                        }

                        bind_mounts[h++] = (BindMount) {
                                .source = s,
                                .destination = d,
                                .read_only = false,
                                .recursive = true,
                                .ignore_enoent = false,
                        };
                }
        }

        /* The number of entries we filled in must match what we counted at the beginning */
        assert(h == n);

        *ret_bind_mounts = bind_mounts;
        *ret_n_bind_mounts = n;
        *ret_empty_directories = TAKE_PTR(empty_directories);

        return (int) n;

finish:
        /* Release the h entries filled in so far, along with the array */
        bind_mount_free_many(bind_mounts, h);
        return r;
}
2283
/* Sets up the mount namespace for a process about to be executed, according to the context, the
 * execution parameters, the flags of the command and the private temporary directories recorded in the
 * runtime object.
 *
 * Returns the result of setup_namespace(), except that -EPERM and -EACCES are logged at debug level and
 * turned into 0. May also return a negative error code if compiling the list of bind mounts fails. */
static int apply_mount_namespace(
                const Unit *u,
                const ExecCommand *command,
                const ExecContext *context,
                const ExecParameters *params,
                const ExecRuntime *runtime) {

        _cleanup_strv_free_ char **empty_directories = NULL;
        char *tmp = NULL, *var = NULL;
        const char *root_dir = NULL, *root_image = NULL;
        NamespaceInfo ns_info = {};
        bool needs_sandboxing;
        BindMount *bind_mounts = NULL;
        size_t n_bind_mounts = 0;
        int r;

        assert(context);

        /* The runtime struct only contains the parent of the private /tmp,
         * which is non-accessible to world users. Inside of it there's a /tmp
         * that is sticky, and that's the one we want to use here. */

        if (context->private_tmp && runtime) {
                if (runtime->tmp_dir)
                        tmp = strjoina(runtime->tmp_dir, "/tmp");
                if (runtime->var_tmp_dir)
                        var = strjoina(runtime->var_tmp_dir, "/tmp");
        }

        /* A root image takes precedence over a root directory */
        if (params->flags & EXEC_APPLY_CHROOT) {
                root_image = context->root_image;

                if (!root_image)
                        root_dir = context->root_directory;
        }

        r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
        if (r < 0)
                return r;

        /*
         * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
         * sandbox info, otherwise enforce it, don't ignore protected paths and
         * fail if we are unable to apply the sandbox inside the mount namespace.
         */
        if (!context->dynamic_user && root_dir)
                ns_info.ignore_protect_paths = true;

        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);

        /* Note that this replaces ns_info as a whole, i.e. the relaxation above only remains in effect if
         * no sandboxing is applied. */
        if (needs_sandboxing)
                ns_info = (NamespaceInfo) {
                        .ignore_protect_paths = false,
                        .private_dev = context->private_devices,
                        .protect_control_groups = context->protect_control_groups,
                        .protect_kernel_tunables = context->protect_kernel_tunables,
                        .protect_kernel_modules = context->protect_kernel_modules,
                        .mount_apivfs = context->mount_apivfs,
                };

        r = setup_namespace(root_dir, root_image,
                            &ns_info, context->read_write_paths,
                            needs_sandboxing ? context->read_only_paths : NULL,
                            needs_sandboxing ? context->inaccessible_paths : NULL,
                            empty_directories,
                            bind_mounts,
                            n_bind_mounts,
                            context->temporary_filesystems,
                            context->n_temporary_filesystems,
                            tmp,
                            var,
                            needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
                            needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
                            context->mount_flags,
                            DISSECT_IMAGE_DISCARD_ON_LOOP);

        /* The compiled list is ours, release it regardless of whether the call above succeeded */
        bind_mount_free_many(bind_mounts, n_bind_mounts);

        /* If we couldn't set up the namespace this is probably due to a
         * missing capability. In this case, silently proceed. */
        if (IN_SET(r, -EPERM, -EACCES)) {
                log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
                return 0;
        }

        return r;
}
2371
2372 static int apply_working_directory(
2373 const ExecContext *context,
2374 const ExecParameters *params,
2375 const char *home,
2376 const bool needs_mount_ns,
2377 int *exit_status) {
2378
2379 const char *d, *wd;
2380
2381 assert(context);
2382 assert(exit_status);
2383
2384 if (context->working_directory_home) {
2385
2386 if (!home) {
2387 *exit_status = EXIT_CHDIR;
2388 return -ENXIO;
2389 }
2390
2391 wd = home;
2392
2393 } else if (context->working_directory)
2394 wd = context->working_directory;
2395 else
2396 wd = "/";
2397
2398 if (params->flags & EXEC_APPLY_CHROOT) {
2399 if (!needs_mount_ns && context->root_directory)
2400 if (chroot(context->root_directory) < 0) {
2401 *exit_status = EXIT_CHROOT;
2402 return -errno;
2403 }
2404
2405 d = wd;
2406 } else
2407 d = prefix_roota(context->root_directory, wd);
2408
2409 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2410 *exit_status = EXIT_CHDIR;
2411 return -errno;
2412 }
2413
2414 return 0;
2415 }
2416
2417 static int setup_keyring(
2418 const Unit *u,
2419 const ExecContext *context,
2420 const ExecParameters *p,
2421 uid_t uid, gid_t gid) {
2422
2423 key_serial_t keyring;
2424 int r = 0;
2425 uid_t saved_uid;
2426 gid_t saved_gid;
2427
2428 assert(u);
2429 assert(context);
2430 assert(p);
2431
2432 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2433 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2434 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2435 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2436 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2437 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2438
2439 if (!(p->flags & EXEC_NEW_KEYRING))
2440 return 0;
2441
2442 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2443 return 0;
2444
2445 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2446 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2447 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2448 * & group is just as nasty as acquiring a reference to the user keyring. */
2449
2450 saved_uid = getuid();
2451 saved_gid = getgid();
2452
2453 if (gid_is_valid(gid) && gid != saved_gid) {
2454 if (setregid(gid, -1) < 0)
2455 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2456 }
2457
2458 if (uid_is_valid(uid) && uid != saved_uid) {
2459 if (setreuid(uid, -1) < 0) {
2460 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2461 goto out;
2462 }
2463 }
2464
2465 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2466 if (keyring == -1) {
2467 if (errno == ENOSYS)
2468 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2469 else if (IN_SET(errno, EACCES, EPERM))
2470 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2471 else if (errno == EDQUOT)
2472 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2473 else
2474 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2475
2476 goto out;
2477 }
2478
2479 /* When requested link the user keyring into the session keyring. */
2480 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2481
2482 if (keyctl(KEYCTL_LINK,
2483 KEY_SPEC_USER_KEYRING,
2484 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2485 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2486 goto out;
2487 }
2488 }
2489
2490 /* Restore uid/gid back */
2491 if (uid_is_valid(uid) && uid != saved_uid) {
2492 if (setreuid(saved_uid, -1) < 0) {
2493 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2494 goto out;
2495 }
2496 }
2497
2498 if (gid_is_valid(gid) && gid != saved_gid) {
2499 if (setregid(saved_gid, -1) < 0)
2500 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2501 }
2502
2503 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2504 if (!sd_id128_is_null(u->invocation_id)) {
2505 key_serial_t key;
2506
2507 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2508 if (key == -1)
2509 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2510 else {
2511 if (keyctl(KEYCTL_SETPERM, key,
2512 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2513 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2514 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2515 }
2516 }
2517
2518 out:
2519 /* Revert back uid & gid for the the last time, and exit */
2520 /* no extra logging, as only the first already reported error matters */
2521 if (getuid() != saved_uid)
2522 (void) setreuid(saved_uid, -1);
2523
2524 if (getgid() != saved_gid)
2525 (void) setregid(saved_gid, -1);
2526
2527 return r;
2528 }
2529
/* Appends each non-negative fd of 'pair' to 'array', bumping the element counter *n once per appended
 * entry. A NULL 'pair' is accepted and appends nothing. The caller has to make sure 'array' has room for
 * up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[2]) {
        size_t k;

        assert(array);
        assert(n);

        if (!pair)
                return;

        for (k = 0; k < 2; k++) {
                /* Negative values denote unset fds, skip those */
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2542
2543 static int close_remaining_fds(
2544 const ExecParameters *params,
2545 const ExecRuntime *runtime,
2546 const DynamicCreds *dcreds,
2547 int user_lookup_fd,
2548 int socket_fd,
2549 int *fds, size_t n_fds) {
2550
2551 size_t n_dont_close = 0;
2552 int dont_close[n_fds + 12];
2553
2554 assert(params);
2555
2556 if (params->stdin_fd >= 0)
2557 dont_close[n_dont_close++] = params->stdin_fd;
2558 if (params->stdout_fd >= 0)
2559 dont_close[n_dont_close++] = params->stdout_fd;
2560 if (params->stderr_fd >= 0)
2561 dont_close[n_dont_close++] = params->stderr_fd;
2562
2563 if (socket_fd >= 0)
2564 dont_close[n_dont_close++] = socket_fd;
2565 if (n_fds > 0) {
2566 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2567 n_dont_close += n_fds;
2568 }
2569
2570 if (runtime)
2571 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2572
2573 if (dcreds) {
2574 if (dcreds->user)
2575 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2576 if (dcreds->group)
2577 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2578 }
2579
2580 if (user_lookup_fd >= 0)
2581 dont_close[n_dont_close++] = user_lookup_fd;
2582
2583 return close_all_fds(dont_close, n_dont_close);
2584 }
2585
2586 static int send_user_lookup(
2587 Unit *unit,
2588 int user_lookup_fd,
2589 uid_t uid,
2590 gid_t gid) {
2591
2592 assert(unit);
2593
2594 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2595 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2596 * specified. */
2597
2598 if (user_lookup_fd < 0)
2599 return 0;
2600
2601 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2602 return 0;
2603
2604 if (writev(user_lookup_fd,
2605 (struct iovec[]) {
2606 IOVEC_INIT(&uid, sizeof(uid)),
2607 IOVEC_INIT(&gid, sizeof(gid)),
2608 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2609 return -errno;
2610
2611 return 0;
2612 }
2613
2614 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2615 int r;
2616
2617 assert(c);
2618 assert(home);
2619 assert(buf);
2620
2621 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2622
2623 if (*home)
2624 return 0;
2625
2626 if (!c->working_directory_home)
2627 return 0;
2628
2629 if (uid == 0) {
2630 /* Hardcode /root as home directory for UID 0 */
2631 *home = "/root";
2632 return 1;
2633 }
2634
2635 r = get_home_dir(buf);
2636 if (r < 0)
2637 return r;
2638
2639 *home = *buf;
2640 return 1;
2641 }
2642
2643 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2644 _cleanup_strv_free_ char ** list = NULL;
2645 ExecDirectoryType t;
2646 int r;
2647
2648 assert(c);
2649 assert(p);
2650 assert(ret);
2651
2652 assert(c->dynamic_user);
2653
2654 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2655 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2656 * directories. */
2657
2658 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2659 char **i;
2660
2661 if (t == EXEC_DIRECTORY_CONFIGURATION)
2662 continue;
2663
2664 if (!p->prefix[t])
2665 continue;
2666
2667 STRV_FOREACH(i, c->directories[t].paths) {
2668 char *e;
2669
2670 if (t == EXEC_DIRECTORY_RUNTIME)
2671 e = strjoin(p->prefix[t], "/", *i);
2672 else
2673 e = strjoin(p->prefix[t], "/private/", *i);
2674 if (!e)
2675 return -ENOMEM;
2676
2677 r = strv_consume(&list, e);
2678 if (r < 0)
2679 return r;
2680 }
2681 }
2682
2683 *ret = TAKE_PTR(list);
2684
2685 return 0;
2686 }
2687
2688 static char *exec_command_line(char **argv);
2689
2690 static int exec_child(
2691 Unit *unit,
2692 const ExecCommand *command,
2693 const ExecContext *context,
2694 const ExecParameters *params,
2695 ExecRuntime *runtime,
2696 DynamicCreds *dcreds,
2697 char **argv,
2698 int socket_fd,
2699 int named_iofds[3],
2700 int *fds,
2701 size_t n_storage_fds,
2702 size_t n_socket_fds,
2703 char **files_env,
2704 int user_lookup_fd,
2705 int *exit_status) {
2706
2707 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2708 _cleanup_free_ char *home_buffer = NULL;
2709 _cleanup_free_ gid_t *supplementary_gids = NULL;
2710 const char *username = NULL, *groupname = NULL;
2711 const char *home = NULL, *shell = NULL;
2712 dev_t journal_stream_dev = 0;
2713 ino_t journal_stream_ino = 0;
2714 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2715 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
2716 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
2717 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
2718 #if HAVE_SELINUX
2719 _cleanup_free_ char *mac_selinux_context_net = NULL;
2720 bool use_selinux = false;
2721 #endif
2722 #if ENABLE_SMACK
2723 bool use_smack = false;
2724 #endif
2725 #if HAVE_APPARMOR
2726 bool use_apparmor = false;
2727 #endif
2728 uid_t uid = UID_INVALID;
2729 gid_t gid = GID_INVALID;
2730 int i, r, ngids = 0;
2731 size_t n_fds;
2732 ExecDirectoryType dt;
2733 int secure_bits;
2734
2735 assert(unit);
2736 assert(command);
2737 assert(context);
2738 assert(params);
2739 assert(exit_status);
2740
2741 rename_process_from_path(command->path);
2742
2743 /* We reset exactly these signals, since they are the
2744 * only ones we set to SIG_IGN in the main daemon. All
2745 * others we leave untouched because we set them to
2746 * SIG_DFL or a valid handler initially, both of which
2747 * will be demoted to SIG_DFL. */
2748 (void) default_signals(SIGNALS_CRASH_HANDLER,
2749 SIGNALS_IGNORE, -1);
2750
2751 if (context->ignore_sigpipe)
2752 (void) ignore_signals(SIGPIPE, -1);
2753
2754 r = reset_signal_mask();
2755 if (r < 0) {
2756 *exit_status = EXIT_SIGNAL_MASK;
2757 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2758 }
2759
2760 if (params->idle_pipe)
2761 do_idle_pipe_dance(params->idle_pipe);
2762
2763 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2764 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2765 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2766 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2767
2768 log_forget_fds();
2769 log_set_open_when_needed(true);
2770
2771 /* In case anything used libc syslog(), close this here, too */
2772 closelog();
2773
2774 n_fds = n_storage_fds + n_socket_fds;
2775 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
2776 if (r < 0) {
2777 *exit_status = EXIT_FDS;
2778 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2779 }
2780
2781 if (!context->same_pgrp)
2782 if (setsid() < 0) {
2783 *exit_status = EXIT_SETSID;
2784 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2785 }
2786
2787 exec_context_tty_reset(context, params);
2788
2789 if (unit_shall_confirm_spawn(unit)) {
2790 const char *vc = params->confirm_spawn;
2791 _cleanup_free_ char *cmdline = NULL;
2792
2793 cmdline = exec_command_line(argv);
2794 if (!cmdline) {
2795 *exit_status = EXIT_MEMORY;
2796 return log_oom();
2797 }
2798
2799 r = ask_for_confirmation(vc, unit, cmdline);
2800 if (r != CONFIRM_EXECUTE) {
2801 if (r == CONFIRM_PRETEND_SUCCESS) {
2802 *exit_status = EXIT_SUCCESS;
2803 return 0;
2804 }
2805 *exit_status = EXIT_CONFIRM;
2806 log_unit_error(unit, "Execution cancelled by the user");
2807 return -ECANCELED;
2808 }
2809 }
2810
2811 if (context->dynamic_user && dcreds) {
2812 _cleanup_strv_free_ char **suggested_paths = NULL;
2813
2814 /* Make sure we bypass our own NSS module for any NSS checks */
2815 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2816 *exit_status = EXIT_USER;
2817 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2818 }
2819
2820 r = compile_suggested_paths(context, params, &suggested_paths);
2821 if (r < 0) {
2822 *exit_status = EXIT_MEMORY;
2823 return log_oom();
2824 }
2825
2826 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2827 if (r < 0) {
2828 *exit_status = EXIT_USER;
2829 if (r == -EILSEQ) {
2830 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2831 return -EOPNOTSUPP;
2832 }
2833 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2834 }
2835
2836 if (!uid_is_valid(uid)) {
2837 *exit_status = EXIT_USER;
2838 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2839 return -ESRCH;
2840 }
2841
2842 if (!gid_is_valid(gid)) {
2843 *exit_status = EXIT_USER;
2844 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2845 return -ESRCH;
2846 }
2847
2848 if (dcreds->user)
2849 username = dcreds->user->name;
2850
2851 } else {
2852 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2853 if (r < 0) {
2854 *exit_status = EXIT_USER;
2855 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2856 }
2857
2858 r = get_fixed_group(context, &groupname, &gid);
2859 if (r < 0) {
2860 *exit_status = EXIT_GROUP;
2861 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2862 }
2863 }
2864
2865 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2866 r = get_supplementary_groups(context, username, groupname, gid,
2867 &supplementary_gids, &ngids);
2868 if (r < 0) {
2869 *exit_status = EXIT_GROUP;
2870 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2871 }
2872
2873 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2874 if (r < 0) {
2875 *exit_status = EXIT_USER;
2876 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2877 }
2878
2879 user_lookup_fd = safe_close(user_lookup_fd);
2880
2881 r = acquire_home(context, uid, &home, &home_buffer);
2882 if (r < 0) {
2883 *exit_status = EXIT_CHDIR;
2884 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2885 }
2886
2887 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2888 * must be sure to drop O_NONBLOCK */
2889 if (socket_fd >= 0)
2890 (void) fd_nonblock(socket_fd, false);
2891
2892 r = setup_input(context, params, socket_fd, named_iofds);
2893 if (r < 0) {
2894 *exit_status = EXIT_STDIN;
2895 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2896 }
2897
2898 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2899 if (r < 0) {
2900 *exit_status = EXIT_STDOUT;
2901 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2902 }
2903
2904 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2905 if (r < 0) {
2906 *exit_status = EXIT_STDERR;
2907 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2908 }
2909
2910 if (params->cgroup_path) {
2911 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2912 if (r < 0) {
2913 *exit_status = EXIT_CGROUP;
2914 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2915 }
2916 }
2917
2918 if (context->oom_score_adjust_set) {
2919 char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
2920
2921 /* When we can't make this change due to EPERM, then
2922 * let's silently skip over it. User namespaces
2923 * prohibit write access to this file, and we
2924 * shouldn't trip up over that. */
2925
2926 sprintf(t, "%i", context->oom_score_adjust);
2927 r = write_string_file("/proc/self/oom_score_adj", t, 0);
2928 if (IN_SET(r, -EPERM, -EACCES))
2929 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2930 else if (r < 0) {
2931 *exit_status = EXIT_OOM_ADJUST;
2932 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
2933 }
2934 }
2935
2936 if (context->nice_set)
2937 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2938 *exit_status = EXIT_NICE;
2939 return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
2940 }
2941
2942 if (context->cpu_sched_set) {
2943 struct sched_param param = {
2944 .sched_priority = context->cpu_sched_priority,
2945 };
2946
2947 r = sched_setscheduler(0,
2948 context->cpu_sched_policy |
2949 (context->cpu_sched_reset_on_fork ?
2950 SCHED_RESET_ON_FORK : 0),
2951 &param);
2952 if (r < 0) {
2953 *exit_status = EXIT_SETSCHEDULER;
2954 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
2955 }
2956 }
2957
2958 if (context->cpuset)
2959 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2960 *exit_status = EXIT_CPUAFFINITY;
2961 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
2962 }
2963
2964 if (context->ioprio_set)
2965 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
2966 *exit_status = EXIT_IOPRIO;
2967 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
2968 }
2969
2970 if (context->timer_slack_nsec != NSEC_INFINITY)
2971 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
2972 *exit_status = EXIT_TIMERSLACK;
2973 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
2974 }
2975
2976 if (context->personality != PERSONALITY_INVALID) {
2977 r = safe_personality(context->personality);
2978 if (r < 0) {
2979 *exit_status = EXIT_PERSONALITY;
2980 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
2981 }
2982 }
2983
2984 if (context->utmp_id)
2985 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
2986 context->tty_path,
2987 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
2988 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
2989 USER_PROCESS,
2990 username);
2991
2992 if (context->user) {
2993 r = chown_terminal(STDIN_FILENO, uid);
2994 if (r < 0) {
2995 *exit_status = EXIT_STDIN;
2996 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
2997 }
2998 }
2999
3000 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
3001 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3002 * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3003 * touch a single hierarchy too. */
3004 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3005 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3006 if (r < 0) {
3007 *exit_status = EXIT_CGROUP;
3008 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3009 }
3010 }
3011
3012 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3013 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3014 if (r < 0)
3015 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3016 }
3017
3018 r = build_environment(
3019 unit,
3020 context,
3021 params,
3022 n_fds,
3023 home,
3024 username,
3025 shell,
3026 journal_stream_dev,
3027 journal_stream_ino,
3028 &our_env);
3029 if (r < 0) {
3030 *exit_status = EXIT_MEMORY;
3031 return log_oom();
3032 }
3033
3034 r = build_pass_environment(context, &pass_env);
3035 if (r < 0) {
3036 *exit_status = EXIT_MEMORY;
3037 return log_oom();
3038 }
3039
3040 accum_env = strv_env_merge(5,
3041 params->environment,
3042 our_env,
3043 pass_env,
3044 context->environment,
3045 files_env,
3046 NULL);
3047 if (!accum_env) {
3048 *exit_status = EXIT_MEMORY;
3049 return log_oom();
3050 }
3051 accum_env = strv_env_clean(accum_env);
3052
3053 (void) umask(context->umask);
3054
3055 r = setup_keyring(unit, context, params, uid, gid);
3056 if (r < 0) {
3057 *exit_status = EXIT_KEYRING;
3058 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3059 }
3060
3061 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3062 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3063
3064 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3065 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3066
3067 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3068 if (needs_ambient_hack)
3069 needs_setuid = false;
3070 else
3071 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3072
3073 if (needs_sandboxing) {
3074 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3075 * present. The actual MAC context application will happen later, as late as possible, to avoid
3076 * impacting our own code paths. */
3077
3078 #if HAVE_SELINUX
3079 use_selinux = mac_selinux_use();
3080 #endif
3081 #if ENABLE_SMACK
3082 use_smack = mac_smack_use();
3083 #endif
3084 #if HAVE_APPARMOR
3085 use_apparmor = mac_apparmor_use();
3086 #endif
3087 }
3088
3089 if (needs_setuid) {
3090 if (context->pam_name && username) {
3091 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3092 if (r < 0) {
3093 *exit_status = EXIT_PAM;
3094 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3095 }
3096 }
3097 }
3098
3099 if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3100 if (ns_type_supported(NAMESPACE_NET)) {
3101 r = setup_netns(runtime->netns_storage_socket);
3102 if (r < 0) {
3103 *exit_status = EXIT_NETWORK;
3104 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3105 }
3106 } else
3107 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3108 }
3109
3110 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3111 if (needs_mount_namespace) {
3112 r = apply_mount_namespace(unit, command, context, params, runtime);
3113 if (r < 0) {
3114 *exit_status = EXIT_NAMESPACE;
3115 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3116 }
3117 }
3118
3119 /* Apply just after mount namespace setup */
3120 r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3121 if (r < 0)
3122 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3123
3124 /* Drop groups as early as possible */
3125 if (needs_setuid) {
3126 r = enforce_groups(gid, supplementary_gids, ngids);
3127 if (r < 0) {
3128 *exit_status = EXIT_GROUP;
3129 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3130 }
3131 }
3132
3133 if (needs_sandboxing) {
3134 #if HAVE_SELINUX
3135 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3136 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3137 if (r < 0) {
3138 *exit_status = EXIT_SELINUX_CONTEXT;
3139 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3140 }
3141 }
3142 #endif
3143
3144 if (context->private_users) {
3145 r = setup_private_users(uid, gid);
3146 if (r < 0) {
3147 *exit_status = EXIT_USER;
3148 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3149 }
3150 }
3151 }
3152
3153 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3154 * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3155 * was needed to upload the policy and can now be closed as well. */
3156 r = close_all_fds(fds, n_fds);
3157 if (r >= 0)
3158 r = shift_fds(fds, n_fds);
3159 if (r >= 0)
3160 r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
3161 if (r < 0) {
3162 *exit_status = EXIT_FDS;
3163 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3164 }
3165
3166 secure_bits = context->secure_bits;
3167
3168 if (needs_sandboxing) {
3169 uint64_t bset;
3170
3171 for (i = 0; i < _RLIMIT_MAX; i++) {
3172
3173 if (!context->rlimit[i])
3174 continue;
3175
3176 r = setrlimit_closest(i, context->rlimit[i]);
3177 if (r < 0) {
3178 *exit_status = EXIT_LIMITS;
3179 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(i));
3180 }
3181 }
3182
3183 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3184 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3185 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3186 *exit_status = EXIT_LIMITS;
3187 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3188 }
3189 }
3190
3191 #if ENABLE_SMACK
3192 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3193 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3194 if (use_smack) {
3195 r = setup_smack(context, command);
3196 if (r < 0) {
3197 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3198 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3199 }
3200 }
3201 #endif
3202
3203 bset = context->capability_bounding_set;
3204 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3205 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3206 * instead of us doing that */
3207 if (needs_ambient_hack)
3208 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3209 (UINT64_C(1) << CAP_SETUID) |
3210 (UINT64_C(1) << CAP_SETGID);
3211
3212 if (!cap_test_all(bset)) {
3213 r = capability_bounding_set_drop(bset, false);
3214 if (r < 0) {
3215 *exit_status = EXIT_CAPABILITIES;
3216 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3217 }
3218 }
3219
3220 /* This is done before enforce_user, but ambient set
3221 * does not survive over setresuid() if keep_caps is not set. */
3222 if (!needs_ambient_hack &&
3223 context->capability_ambient_set != 0) {
3224 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3225 if (r < 0) {
3226 *exit_status = EXIT_CAPABILITIES;
3227 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3228 }
3229 }
3230 }
3231
3232 if (needs_setuid) {
3233 if (context->user) {
3234 r = enforce_user(context, uid);
3235 if (r < 0) {
3236 *exit_status = EXIT_USER;
3237 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3238 }
3239
3240 if (!needs_ambient_hack &&
3241 context->capability_ambient_set != 0) {
3242
3243 /* Fix the ambient capabilities after user change. */
3244 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3245 if (r < 0) {
3246 *exit_status = EXIT_CAPABILITIES;
3247 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3248 }
3249
3250 /* If we were asked to change user and ambient capabilities
3251 * were requested, we had to add keep-caps to the securebits
3252 * so that we would maintain the inherited capability set
3253 * through the setresuid(). Make sure that the bit is added
3254 * also to the context secure_bits so that we don't try to
3255 * drop the bit away next. */
3256
3257 secure_bits |= 1<<SECURE_KEEP_CAPS;
3258 }
3259 }
3260 }
3261
3262 if (needs_sandboxing) {
3263 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3264 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3265 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3266 * are restricted. */
3267
3268 #if HAVE_SELINUX
3269 if (use_selinux) {
3270 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3271
3272 if (exec_context) {
3273 r = setexeccon(exec_context);
3274 if (r < 0) {
3275 *exit_status = EXIT_SELINUX_CONTEXT;
3276 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3277 }
3278 }
3279 }
3280 #endif
3281
3282 #if HAVE_APPARMOR
3283 if (use_apparmor && context->apparmor_profile) {
3284 r = aa_change_onexec(context->apparmor_profile);
3285 if (r < 0 && !context->apparmor_profile_ignore) {
3286 *exit_status = EXIT_APPARMOR_PROFILE;
3287 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3288 }
3289 }
3290 #endif
3291
3292 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3293 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3294 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3295 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3296 *exit_status = EXIT_SECUREBITS;
3297 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3298 }
3299
3300 if (context_has_no_new_privileges(context))
3301 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3302 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3303 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3304 }
3305
3306 #if HAVE_SECCOMP
3307 r = apply_address_families(unit, context);
3308 if (r < 0) {
3309 *exit_status = EXIT_ADDRESS_FAMILIES;
3310 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3311 }
3312
3313 r = apply_memory_deny_write_execute(unit, context);
3314 if (r < 0) {
3315 *exit_status = EXIT_SECCOMP;
3316 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3317 }
3318
3319 r = apply_restrict_realtime(unit, context);
3320 if (r < 0) {
3321 *exit_status = EXIT_SECCOMP;
3322 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3323 }
3324
3325 r = apply_restrict_namespaces(unit, context);
3326 if (r < 0) {
3327 *exit_status = EXIT_SECCOMP;
3328 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3329 }
3330
3331 r = apply_protect_sysctl(unit, context);
3332 if (r < 0) {
3333 *exit_status = EXIT_SECCOMP;
3334 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3335 }
3336
3337 r = apply_protect_kernel_modules(unit, context);
3338 if (r < 0) {
3339 *exit_status = EXIT_SECCOMP;
3340 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3341 }
3342
3343 r = apply_private_devices(unit, context);
3344 if (r < 0) {
3345 *exit_status = EXIT_SECCOMP;
3346 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3347 }
3348
3349 r = apply_syscall_archs(unit, context);
3350 if (r < 0) {
3351 *exit_status = EXIT_SECCOMP;
3352 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3353 }
3354
3355 r = apply_lock_personality(unit, context);
3356 if (r < 0) {
3357 *exit_status = EXIT_SECCOMP;
3358 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3359 }
3360
3361 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3362 * by the filter as little as possible. */
3363 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3364 if (r < 0) {
3365 *exit_status = EXIT_SECCOMP;
3366 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3367 }
3368 #endif
3369 }
3370
3371 if (!strv_isempty(context->unset_environment)) {
3372 char **ee = NULL;
3373
3374 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3375 if (!ee) {
3376 *exit_status = EXIT_MEMORY;
3377 return log_oom();
3378 }
3379
3380 strv_free_and_replace(accum_env, ee);
3381 }
3382
3383 final_argv = replace_env_argv(argv, accum_env);
3384 if (!final_argv) {
3385 *exit_status = EXIT_MEMORY;
3386 return log_oom();
3387 }
3388
3389 if (DEBUG_LOGGING) {
3390 _cleanup_free_ char *line;
3391
3392 line = exec_command_line(final_argv);
3393 if (line) {
3394 log_struct(LOG_DEBUG,
3395 "EXECUTABLE=%s", command->path,
3396 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3397 LOG_UNIT_ID(unit),
3398 LOG_UNIT_INVOCATION_ID(unit),
3399 NULL);
3400 }
3401 }
3402
3403 execve(command->path, final_argv, accum_env);
3404
3405 if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3406
3407 log_struct_errno(LOG_INFO, errno,
3408 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3409 LOG_UNIT_ID(unit),
3410 LOG_UNIT_INVOCATION_ID(unit),
3411 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3412 command->path),
3413 "EXECUTABLE=%s", command->path,
3414 NULL);
3415
3416 return 0;
3417 }
3418
3419 *exit_status = EXIT_EXEC;
3420 return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
3421 }
3422
3423 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
3424 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]);
3425
3426 int exec_spawn(Unit *unit,
3427 ExecCommand *command,
3428 const ExecContext *context,
3429 const ExecParameters *params,
3430 ExecRuntime *runtime,
3431 DynamicCreds *dcreds,
3432 pid_t *ret) {
3433
3434 _cleanup_strv_free_ char **files_env = NULL;
3435 int *fds = NULL;
3436 size_t n_storage_fds = 0, n_socket_fds = 0;
3437 _cleanup_free_ char *line = NULL;
3438 int socket_fd, r;
3439 int named_iofds[3] = { -1, -1, -1 };
3440 char **argv;
3441 pid_t pid;
3442
3443 assert(unit);
3444 assert(command);
3445 assert(context);
3446 assert(ret);
3447 assert(params);
3448 assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
3449
3450 if (context->std_input == EXEC_INPUT_SOCKET ||
3451 context->std_output == EXEC_OUTPUT_SOCKET ||
3452 context->std_error == EXEC_OUTPUT_SOCKET) {
3453
3454 if (params->n_socket_fds > 1) {
3455 log_unit_error(unit, "Got more than one socket.");
3456 return -EINVAL;
3457 }
3458
3459 if (params->n_socket_fds == 0) {
3460 log_unit_error(unit, "Got no socket.");
3461 return -EINVAL;
3462 }
3463
3464 socket_fd = params->fds[0];
3465 } else {
3466 socket_fd = -1;
3467 fds = params->fds;
3468 n_storage_fds = params->n_storage_fds;
3469 n_socket_fds = params->n_socket_fds;
3470 }
3471
3472 r = exec_context_named_iofds(context, params, named_iofds);
3473 if (r < 0)
3474 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3475
3476 r = exec_context_load_environment(unit, context, &files_env);
3477 if (r < 0)
3478 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3479
3480 argv = params->argv ?: command->argv;
3481 line = exec_command_line(argv);
3482 if (!line)
3483 return log_oom();
3484
3485 log_struct(LOG_DEBUG,
3486 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3487 "EXECUTABLE=%s", command->path,
3488 LOG_UNIT_ID(unit),
3489 LOG_UNIT_INVOCATION_ID(unit),
3490 NULL);
3491
3492 pid = fork();
3493 if (pid < 0)
3494 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3495
3496 if (pid == 0) {
3497 int exit_status = EXIT_SUCCESS;
3498
3499 r = exec_child(unit,
3500 command,
3501 context,
3502 params,
3503 runtime,
3504 dcreds,
3505 argv,
3506 socket_fd,
3507 named_iofds,
3508 fds,
3509 n_storage_fds,
3510 n_socket_fds,
3511 files_env,
3512 unit->manager->user_lookup_fds[1],
3513 &exit_status);
3514
3515 if (r < 0) {
3516 log_struct_errno(LOG_ERR, r,
3517 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3518 LOG_UNIT_ID(unit),
3519 LOG_UNIT_INVOCATION_ID(unit),
3520 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3521 exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3522 command->path),
3523 "EXECUTABLE=%s", command->path,
3524 NULL);
3525 }
3526
3527 _exit(exit_status);
3528 }
3529
3530 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3531
3532 /* We add the new process to the cgroup both in the child (so
3533 * that we can be sure that no user code is ever executed
3534 * outside of the cgroup) and in the parent (so that we can be
3535 * sure that when we kill the cgroup the process will be
3536 * killed too). */
3537 if (params->cgroup_path)
3538 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3539
3540 exec_status_start(&command->exec_status, pid);
3541
3542 *ret = pid;
3543 return 0;
3544 }
3545
3546 void exec_context_init(ExecContext *c) {
3547 ExecDirectoryType i;
3548
3549 assert(c);
3550
3551 c->umask = 0022;
3552 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3553 c->cpu_sched_policy = SCHED_OTHER;
3554 c->syslog_priority = LOG_DAEMON|LOG_INFO;
3555 c->syslog_level_prefix = true;
3556 c->ignore_sigpipe = true;
3557 c->timer_slack_nsec = NSEC_INFINITY;
3558 c->personality = PERSONALITY_INVALID;
3559 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3560 c->directories[i].mode = 0755;
3561 c->capability_bounding_set = CAP_ALL;
3562 c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
3563 c->log_level_max = -1;
3564 }
3565
3566 void exec_context_done(ExecContext *c) {
3567 ExecDirectoryType i;
3568 size_t l;
3569
3570 assert(c);
3571
3572 c->environment = strv_free(c->environment);
3573 c->environment_files = strv_free(c->environment_files);
3574 c->pass_environment = strv_free(c->pass_environment);
3575 c->unset_environment = strv_free(c->unset_environment);
3576
3577 for (l = 0; l < ELEMENTSOF(c->rlimit); l++)
3578 c->rlimit[l] = mfree(c->rlimit[l]);
3579
3580 for (l = 0; l < 3; l++) {
3581 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3582 c->stdio_file[l] = mfree(c->stdio_file[l]);
3583 }
3584
3585 c->working_directory = mfree(c->working_directory);
3586 c->root_directory = mfree(c->root_directory);
3587 c->root_image = mfree(c->root_image);
3588 c->tty_path = mfree(c->tty_path);
3589 c->syslog_identifier = mfree(c->syslog_identifier);
3590 c->user = mfree(c->user);
3591 c->group = mfree(c->group);
3592
3593 c->supplementary_groups = strv_free(c->supplementary_groups);
3594
3595 c->pam_name = mfree(c->pam_name);
3596
3597 c->read_only_paths = strv_free(c->read_only_paths);
3598 c->read_write_paths = strv_free(c->read_write_paths);
3599 c->inaccessible_paths = strv_free(c->inaccessible_paths);
3600
3601 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3602 c->bind_mounts = NULL;
3603 c->n_bind_mounts = 0;
3604 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3605 c->temporary_filesystems = NULL;
3606 c->n_temporary_filesystems = 0;
3607
3608 c->cpuset = cpu_set_mfree(c->cpuset);
3609
3610 c->utmp_id = mfree(c->utmp_id);
3611 c->selinux_context = mfree(c->selinux_context);
3612 c->apparmor_profile = mfree(c->apparmor_profile);
3613 c->smack_process_label = mfree(c->smack_process_label);
3614
3615 c->syscall_filter = hashmap_free(c->syscall_filter);
3616 c->syscall_archs = set_free(c->syscall_archs);
3617 c->address_families = set_free(c->address_families);
3618
3619 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3620 c->directories[i].paths = strv_free(c->directories[i].paths);
3621
3622 c->log_level_max = -1;
3623
3624 exec_context_free_log_extra_fields(c);
3625
3626 c->stdin_data = mfree(c->stdin_data);
3627 c->stdin_data_size = 0;
3628 }
3629
3630 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
3631 char **i;
3632
3633 assert(c);
3634
3635 if (!runtime_prefix)
3636 return 0;
3637
3638 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3639 _cleanup_free_ char *p;
3640
3641 p = strjoin(runtime_prefix, "/", *i);
3642 if (!p)
3643 return -ENOMEM;
3644
3645 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3646 * next. */
3647 (void) rm_rf(p, REMOVE_ROOT);
3648 }
3649
3650 return 0;
3651 }
3652
3653 static void exec_command_done(ExecCommand *c) {
3654 assert(c);
3655
3656 c->path = mfree(c->path);
3657
3658 c->argv = strv_free(c->argv);
3659 }
3660
3661 void exec_command_done_array(ExecCommand *c, size_t n) {
3662 size_t i;
3663
3664 for (i = 0; i < n; i++)
3665 exec_command_done(c+i);
3666 }
3667
3668 ExecCommand* exec_command_free_list(ExecCommand *c) {
3669 ExecCommand *i;
3670
3671 while ((i = c)) {
3672 LIST_REMOVE(command, c, i);
3673 exec_command_done(i);
3674 free(i);
3675 }
3676
3677 return NULL;
3678 }
3679
3680 void exec_command_free_array(ExecCommand **c, size_t n) {
3681 size_t i;
3682
3683 for (i = 0; i < n; i++)
3684 c[i] = exec_command_free_list(c[i]);
3685 }
3686
3687 typedef struct InvalidEnvInfo {
3688 const Unit *unit;
3689 const char *path;
3690 } InvalidEnvInfo;
3691
3692 static void invalid_env(const char *p, void *userdata) {
3693 InvalidEnvInfo *info = userdata;
3694
3695 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3696 }
3697
3698 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3699 assert(c);
3700
3701 switch (fd_index) {
3702
3703 case STDIN_FILENO:
3704 if (c->std_input != EXEC_INPUT_NAMED_FD)
3705 return NULL;
3706
3707 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3708
3709 case STDOUT_FILENO:
3710 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3711 return NULL;
3712
3713 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3714
3715 case STDERR_FILENO:
3716 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3717 return NULL;
3718
3719 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3720
3721 default:
3722 return NULL;
3723 }
3724 }
3725
3726 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3727 size_t i, targets;
3728 const char* stdio_fdname[3];
3729 size_t n_fds;
3730
3731 assert(c);
3732 assert(p);
3733
3734 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3735 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3736 (c->std_error == EXEC_OUTPUT_NAMED_FD);
3737
3738 for (i = 0; i < 3; i++)
3739 stdio_fdname[i] = exec_context_fdname(c, i);
3740
3741 n_fds = p->n_storage_fds + p->n_socket_fds;
3742
3743 for (i = 0; i < n_fds && targets > 0; i++)
3744 if (named_iofds[STDIN_FILENO] < 0 &&
3745 c->std_input == EXEC_INPUT_NAMED_FD &&
3746 stdio_fdname[STDIN_FILENO] &&
3747 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3748
3749 named_iofds[STDIN_FILENO] = p->fds[i];
3750 targets--;
3751
3752 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3753 c->std_output == EXEC_OUTPUT_NAMED_FD &&
3754 stdio_fdname[STDOUT_FILENO] &&
3755 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3756
3757 named_iofds[STDOUT_FILENO] = p->fds[i];
3758 targets--;
3759
3760 } else if (named_iofds[STDERR_FILENO] < 0 &&
3761 c->std_error == EXEC_OUTPUT_NAMED_FD &&
3762 stdio_fdname[STDERR_FILENO] &&
3763 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3764
3765 named_iofds[STDERR_FILENO] = p->fds[i];
3766 targets--;
3767 }
3768
3769 return targets == 0 ? 0 : -ENOENT;
3770 }
3771
3772 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
3773 char **i, **r = NULL;
3774
3775 assert(c);
3776 assert(l);
3777
3778 STRV_FOREACH(i, c->environment_files) {
3779 char *fn;
3780 int k;
3781 unsigned n;
3782 bool ignore = false;
3783 char **p;
3784 _cleanup_globfree_ glob_t pglob = {};
3785
3786 fn = *i;
3787
3788 if (fn[0] == '-') {
3789 ignore = true;
3790 fn++;
3791 }
3792
3793 if (!path_is_absolute(fn)) {
3794 if (ignore)
3795 continue;
3796
3797 strv_free(r);
3798 return -EINVAL;
3799 }
3800
3801 /* Filename supports globbing, take all matching files */
3802 k = safe_glob(fn, 0, &pglob);
3803 if (k < 0) {
3804 if (ignore)
3805 continue;
3806
3807 strv_free(r);
3808 return k;
3809 }
3810
3811 /* When we don't match anything, -ENOENT should be returned */
3812 assert(pglob.gl_pathc > 0);
3813
3814 for (n = 0; n < pglob.gl_pathc; n++) {
3815 k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3816 if (k < 0) {
3817 if (ignore)
3818 continue;
3819
3820 strv_free(r);
3821 return k;
3822 }
3823 /* Log invalid environment variables with filename */
3824 if (p) {
3825 InvalidEnvInfo info = {
3826 .unit = unit,
3827 .path = pglob.gl_pathv[n]
3828 };
3829
3830 p = strv_env_clean_with_callback(p, invalid_env, &info);
3831 }
3832
3833 if (!r)
3834 r = p;
3835 else {
3836 char **m;
3837
3838 m = strv_env_merge(2, r, p);
3839 strv_free(r);
3840 strv_free(p);
3841 if (!m)
3842 return -ENOMEM;
3843
3844 r = m;
3845 }
3846 }
3847 }
3848
3849 *l = r;
3850
3851 return 0;
3852 }
3853
3854 static bool tty_may_match_dev_console(const char *tty) {
3855 _cleanup_free_ char *resolved = NULL;
3856
3857 if (!tty)
3858 return true;
3859
3860 tty = skip_dev_prefix(tty);
3861
3862 /* trivial identity? */
3863 if (streq(tty, "console"))
3864 return true;
3865
3866 if (resolve_dev_console(&resolved) < 0)
3867 return true; /* if we could not resolve, assume it may */
3868
3869 /* "tty0" means the active VC, so it may be the same sometimes */
3870 return streq(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
3871 }
3872
3873 bool exec_context_may_touch_console(const ExecContext *ec) {
3874
3875 return (ec->tty_reset ||
3876 ec->tty_vhangup ||
3877 ec->tty_vt_disallocate ||
3878 is_terminal_input(ec->std_input) ||
3879 is_terminal_output(ec->std_output) ||
3880 is_terminal_output(ec->std_error)) &&
3881 tty_may_match_dev_console(exec_context_tty_path(ec));
3882 }
3883
3884 static void strv_fprintf(FILE *f, char **l) {
3885 char **g;
3886
3887 assert(f);
3888
3889 STRV_FOREACH(g, l)
3890 fprintf(f, " %s", *g);
3891 }
3892
3893 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
3894 ExecDirectoryType dt;
3895 char **e, **d;
3896 unsigned i;
3897 int r;
3898
3899 assert(c);
3900 assert(f);
3901
3902 prefix = strempty(prefix);
3903
3904 fprintf(f,
3905 "%sUMask: %04o\n"
3906 "%sWorkingDirectory: %s\n"
3907 "%sRootDirectory: %s\n"
3908 "%sNonBlocking: %s\n"
3909 "%sPrivateTmp: %s\n"
3910 "%sPrivateDevices: %s\n"
3911 "%sProtectKernelTunables: %s\n"
3912 "%sProtectKernelModules: %s\n"
3913 "%sProtectControlGroups: %s\n"
3914 "%sPrivateNetwork: %s\n"
3915 "%sPrivateUsers: %s\n"
3916 "%sProtectHome: %s\n"
3917 "%sProtectSystem: %s\n"
3918 "%sMountAPIVFS: %s\n"
3919 "%sIgnoreSIGPIPE: %s\n"
3920 "%sMemoryDenyWriteExecute: %s\n"
3921 "%sRestrictRealtime: %s\n"
3922 "%sKeyringMode: %s\n",
3923 prefix, c->umask,
3924 prefix, c->working_directory ? c->working_directory : "/",
3925 prefix, c->root_directory ? c->root_directory : "/",
3926 prefix, yes_no(c->non_blocking),
3927 prefix, yes_no(c->private_tmp),
3928 prefix, yes_no(c->private_devices),
3929 prefix, yes_no(c->protect_kernel_tunables),
3930 prefix, yes_no(c->protect_kernel_modules),
3931 prefix, yes_no(c->protect_control_groups),
3932 prefix, yes_no(c->private_network),
3933 prefix, yes_no(c->private_users),
3934 prefix, protect_home_to_string(c->protect_home),
3935 prefix, protect_system_to_string(c->protect_system),
3936 prefix, yes_no(c->mount_apivfs),
3937 prefix, yes_no(c->ignore_sigpipe),
3938 prefix, yes_no(c->memory_deny_write_execute),
3939 prefix, yes_no(c->restrict_realtime),
3940 prefix, exec_keyring_mode_to_string(c->keyring_mode));
3941
3942 if (c->root_image)
3943 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
3944
3945 STRV_FOREACH(e, c->environment)
3946 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
3947
3948 STRV_FOREACH(e, c->environment_files)
3949 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
3950
3951 STRV_FOREACH(e, c->pass_environment)
3952 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
3953
3954 STRV_FOREACH(e, c->unset_environment)
3955 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
3956
3957 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
3958
3959 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3960 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
3961
3962 STRV_FOREACH(d, c->directories[dt].paths)
3963 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
3964 }
3965
3966 if (c->nice_set)
3967 fprintf(f,
3968 "%sNice: %i\n",
3969 prefix, c->nice);
3970
3971 if (c->oom_score_adjust_set)
3972 fprintf(f,
3973 "%sOOMScoreAdjust: %i\n",
3974 prefix, c->oom_score_adjust);
3975
3976 for (i = 0; i < RLIM_NLIMITS; i++)
3977 if (c->rlimit[i]) {
3978 fprintf(f, "Limit%s%s: " RLIM_FMT "\n",
3979 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
3980 fprintf(f, "Limit%s%sSoft: " RLIM_FMT "\n",
3981 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
3982 }
3983
3984 if (c->ioprio_set) {
3985 _cleanup_free_ char *class_str = NULL;
3986
3987 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
3988 if (r >= 0)
3989 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
3990
3991 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
3992 }
3993
3994 if (c->cpu_sched_set) {
3995 _cleanup_free_ char *policy_str = NULL;
3996
3997 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
3998 if (r >= 0)
3999 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4000
4001 fprintf(f,
4002 "%sCPUSchedulingPriority: %i\n"
4003 "%sCPUSchedulingResetOnFork: %s\n",
4004 prefix, c->cpu_sched_priority,
4005 prefix, yes_no(c->cpu_sched_reset_on_fork));
4006 }
4007
4008 if (c->cpuset) {
4009 fprintf(f, "%sCPUAffinity:", prefix);
4010 for (i = 0; i < c->cpuset_ncpus; i++)
4011 if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
4012 fprintf(f, " %u", i);
4013 fputs("\n", f);
4014 }
4015
4016 if (c->timer_slack_nsec != NSEC_INFINITY)
4017 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4018
4019 fprintf(f,
4020 "%sStandardInput: %s\n"
4021 "%sStandardOutput: %s\n"
4022 "%sStandardError: %s\n",
4023 prefix, exec_input_to_string(c->std_input),
4024 prefix, exec_output_to_string(c->std_output),
4025 prefix, exec_output_to_string(c->std_error));
4026
4027 if (c->std_input == EXEC_INPUT_NAMED_FD)
4028 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4029 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4030 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4031 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4032 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4033
4034 if (c->std_input == EXEC_INPUT_FILE)
4035 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4036 if (c->std_output == EXEC_OUTPUT_FILE)
4037 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4038 if (c->std_error == EXEC_OUTPUT_FILE)
4039 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4040
4041 if (c->tty_path)
4042 fprintf(f,
4043 "%sTTYPath: %s\n"
4044 "%sTTYReset: %s\n"
4045 "%sTTYVHangup: %s\n"
4046 "%sTTYVTDisallocate: %s\n",
4047 prefix, c->tty_path,
4048 prefix, yes_no(c->tty_reset),
4049 prefix, yes_no(c->tty_vhangup),
4050 prefix, yes_no(c->tty_vt_disallocate));
4051
4052 if (IN_SET(c->std_output,
4053 EXEC_OUTPUT_SYSLOG,
4054 EXEC_OUTPUT_KMSG,
4055 EXEC_OUTPUT_JOURNAL,
4056 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4057 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4058 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4059 IN_SET(c->std_error,
4060 EXEC_OUTPUT_SYSLOG,
4061 EXEC_OUTPUT_KMSG,
4062 EXEC_OUTPUT_JOURNAL,
4063 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4064 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4065 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4066
4067 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4068
4069 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4070 if (r >= 0)
4071 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4072
4073 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4074 if (r >= 0)
4075 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4076 }
4077
4078 if (c->log_level_max >= 0) {
4079 _cleanup_free_ char *t = NULL;
4080
4081 (void) log_level_to_string_alloc(c->log_level_max, &t);
4082
4083 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4084 }
4085
4086 if (c->n_log_extra_fields > 0) {
4087 size_t j;
4088
4089 for (j = 0; j < c->n_log_extra_fields; j++) {
4090 fprintf(f, "%sLogExtraFields: ", prefix);
4091 fwrite(c->log_extra_fields[j].iov_base,
4092 1, c->log_extra_fields[j].iov_len,
4093 f);
4094 fputc('\n', f);
4095 }
4096 }
4097
4098 if (c->secure_bits) {
4099 _cleanup_free_ char *str = NULL;
4100
4101 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4102 if (r >= 0)
4103 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4104 }
4105
4106 if (c->capability_bounding_set != CAP_ALL) {
4107 _cleanup_free_ char *str = NULL;
4108
4109 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4110 if (r >= 0)
4111 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4112 }
4113
4114 if (c->capability_ambient_set != 0) {
4115 _cleanup_free_ char *str = NULL;
4116
4117 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4118 if (r >= 0)
4119 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4120 }
4121
4122 if (c->user)
4123 fprintf(f, "%sUser: %s\n", prefix, c->user);
4124 if (c->group)
4125 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4126
4127 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4128
4129 if (!strv_isempty(c->supplementary_groups)) {
4130 fprintf(f, "%sSupplementaryGroups:", prefix);
4131 strv_fprintf(f, c->supplementary_groups);
4132 fputs("\n", f);
4133 }
4134
4135 if (c->pam_name)
4136 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4137
4138 if (!strv_isempty(c->read_write_paths)) {
4139 fprintf(f, "%sReadWritePaths:", prefix);
4140 strv_fprintf(f, c->read_write_paths);
4141 fputs("\n", f);
4142 }
4143
4144 if (!strv_isempty(c->read_only_paths)) {
4145 fprintf(f, "%sReadOnlyPaths:", prefix);
4146 strv_fprintf(f, c->read_only_paths);
4147 fputs("\n", f);
4148 }
4149
4150 if (!strv_isempty(c->inaccessible_paths)) {
4151 fprintf(f, "%sInaccessiblePaths:", prefix);
4152 strv_fprintf(f, c->inaccessible_paths);
4153 fputs("\n", f);
4154 }
4155
4156 if (c->n_bind_mounts > 0)
4157 for (i = 0; i < c->n_bind_mounts; i++)
4158 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4159 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4160 c->bind_mounts[i].ignore_enoent ? "-": "",
4161 c->bind_mounts[i].source,
4162 c->bind_mounts[i].destination,
4163 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4164
4165 if (c->n_temporary_filesystems > 0)
4166 for (i = 0; i < c->n_temporary_filesystems; i++) {
4167 TemporaryFileSystem *t = c->temporary_filesystems + i;
4168
4169 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4170 t->path,
4171 isempty(t->options) ? "" : ":",
4172 strempty(t->options));
4173 }
4174
4175 if (c->utmp_id)
4176 fprintf(f,
4177 "%sUtmpIdentifier: %s\n",
4178 prefix, c->utmp_id);
4179
4180 if (c->selinux_context)
4181 fprintf(f,
4182 "%sSELinuxContext: %s%s\n",
4183 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4184
4185 if (c->apparmor_profile)
4186 fprintf(f,
4187 "%sAppArmorProfile: %s%s\n",
4188 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4189
4190 if (c->smack_process_label)
4191 fprintf(f,
4192 "%sSmackProcessLabel: %s%s\n",
4193 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4194
4195 if (c->personality != PERSONALITY_INVALID)
4196 fprintf(f,
4197 "%sPersonality: %s\n",
4198 prefix, strna(personality_to_string(c->personality)));
4199
4200 fprintf(f,
4201 "%sLockPersonality: %s\n",
4202 prefix, yes_no(c->lock_personality));
4203
4204 if (c->syscall_filter) {
4205 #if HAVE_SECCOMP
4206 Iterator j;
4207 void *id, *val;
4208 bool first = true;
4209 #endif
4210
4211 fprintf(f,
4212 "%sSystemCallFilter: ",
4213 prefix);
4214
4215 if (!c->syscall_whitelist)
4216 fputc('~', f);
4217
4218 #if HAVE_SECCOMP
4219 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4220 _cleanup_free_ char *name = NULL;
4221 const char *errno_name = NULL;
4222 int num = PTR_TO_INT(val);
4223
4224 if (first)
4225 first = false;
4226 else
4227 fputc(' ', f);
4228
4229 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4230 fputs(strna(name), f);
4231
4232 if (num >= 0) {
4233 errno_name = errno_to_name(num);
4234 if (errno_name)
4235 fprintf(f, ":%s", errno_name);
4236 else
4237 fprintf(f, ":%d", num);
4238 }
4239 }
4240 #endif
4241
4242 fputc('\n', f);
4243 }
4244
4245 if (c->syscall_archs) {
4246 #if HAVE_SECCOMP
4247 Iterator j;
4248 void *id;
4249 #endif
4250
4251 fprintf(f,
4252 "%sSystemCallArchitectures:",
4253 prefix);
4254
4255 #if HAVE_SECCOMP
4256 SET_FOREACH(id, c->syscall_archs, j)
4257 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4258 #endif
4259 fputc('\n', f);
4260 }
4261
4262 if (exec_context_restrict_namespaces_set(c)) {
4263 _cleanup_free_ char *s = NULL;
4264
4265 r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
4266 if (r >= 0)
4267 fprintf(f, "%sRestrictNamespaces: %s\n",
4268 prefix, s);
4269 }
4270
4271 if (c->syscall_errno > 0) {
4272 const char *errno_name;
4273
4274 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4275
4276 errno_name = errno_to_name(c->syscall_errno);
4277 if (errno_name)
4278 fprintf(f, "%s\n", errno_name);
4279 else
4280 fprintf(f, "%d\n", c->syscall_errno);
4281 }
4282
4283 if (c->apparmor_profile)
4284 fprintf(f,
4285 "%sAppArmorProfile: %s%s\n",
4286 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4287 }
4288
4289 bool exec_context_maintains_privileges(const ExecContext *c) {
4290 assert(c);
4291
4292 /* Returns true if the process forked off would run under
4293 * an unchanged UID or as root. */
4294
4295 if (!c->user)
4296 return true;
4297
4298 if (streq(c->user, "root") || streq(c->user, "0"))
4299 return true;
4300
4301 return false;
4302 }
4303
4304 int exec_context_get_effective_ioprio(const ExecContext *c) {
4305 int p;
4306
4307 assert(c);
4308
4309 if (c->ioprio_set)
4310 return c->ioprio;
4311
4312 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4313 if (p < 0)
4314 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4315
4316 return p;
4317 }
4318
4319 void exec_context_free_log_extra_fields(ExecContext *c) {
4320 size_t l;
4321
4322 assert(c);
4323
4324 for (l = 0; l < c->n_log_extra_fields; l++)
4325 free(c->log_extra_fields[l].iov_base);
4326 c->log_extra_fields = mfree(c->log_extra_fields);
4327 c->n_log_extra_fields = 0;
4328 }
4329
4330 void exec_status_start(ExecStatus *s, pid_t pid) {
4331 assert(s);
4332
4333 zero(*s);
4334 s->pid = pid;
4335 dual_timestamp_get(&s->start_timestamp);
4336 }
4337
4338 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4339 assert(s);
4340
4341 if (s->pid && s->pid != pid)
4342 zero(*s);
4343
4344 s->pid = pid;
4345 dual_timestamp_get(&s->exit_timestamp);
4346
4347 s->code = code;
4348 s->status = status;
4349
4350 if (context) {
4351 if (context->utmp_id)
4352 utmp_put_dead_process(context->utmp_id, pid, code, status);
4353
4354 exec_context_tty_reset(context, NULL);
4355 }
4356 }
4357
4358 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4359 char buf[FORMAT_TIMESTAMP_MAX];
4360
4361 assert(s);
4362 assert(f);
4363
4364 if (s->pid <= 0)
4365 return;
4366
4367 prefix = strempty(prefix);
4368
4369 fprintf(f,
4370 "%sPID: "PID_FMT"\n",
4371 prefix, s->pid);
4372
4373 if (dual_timestamp_is_set(&s->start_timestamp))
4374 fprintf(f,
4375 "%sStart Timestamp: %s\n",
4376 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4377
4378 if (dual_timestamp_is_set(&s->exit_timestamp))
4379 fprintf(f,
4380 "%sExit Timestamp: %s\n"
4381 "%sExit Code: %s\n"
4382 "%sExit Status: %i\n",
4383 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4384 prefix, sigchld_code_to_string(s->code),
4385 prefix, s->status);
4386 }
4387
4388 static char *exec_command_line(char **argv) {
4389 size_t k;
4390 char *n, *p, **a;
4391 bool first = true;
4392
4393 assert(argv);
4394
4395 k = 1;
4396 STRV_FOREACH(a, argv)
4397 k += strlen(*a)+3;
4398
4399 n = new(char, k);
4400 if (!n)
4401 return NULL;
4402
4403 p = n;
4404 STRV_FOREACH(a, argv) {
4405
4406 if (!first)
4407 *(p++) = ' ';
4408 else
4409 first = false;
4410
4411 if (strpbrk(*a, WHITESPACE)) {
4412 *(p++) = '\'';
4413 p = stpcpy(p, *a);
4414 *(p++) = '\'';
4415 } else
4416 p = stpcpy(p, *a);
4417
4418 }
4419
4420 *p = 0;
4421
4422 /* FIXME: this doesn't really handle arguments that have
4423 * spaces and ticks in them */
4424
4425 return n;
4426 }
4427
4428 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4429 _cleanup_free_ char *cmd = NULL;
4430 const char *prefix2;
4431
4432 assert(c);
4433 assert(f);
4434
4435 prefix = strempty(prefix);
4436 prefix2 = strjoina(prefix, "\t");
4437
4438 cmd = exec_command_line(c->argv);
4439 fprintf(f,
4440 "%sCommand Line: %s\n",
4441 prefix, cmd ? cmd : strerror(ENOMEM));
4442
4443 exec_status_dump(&c->exec_status, f, prefix2);
4444 }
4445
4446 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4447 assert(f);
4448
4449 prefix = strempty(prefix);
4450
4451 LIST_FOREACH(command, c, c)
4452 exec_command_dump(c, f, prefix);
4453 }
4454
4455 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4456 ExecCommand *end;
4457
4458 assert(l);
4459 assert(e);
4460
4461 if (*l) {
4462 /* It's kind of important, that we keep the order here */
4463 LIST_FIND_TAIL(command, *l, end);
4464 LIST_INSERT_AFTER(command, *l, end, e);
4465 } else
4466 *l = e;
4467 }
4468
4469 int exec_command_set(ExecCommand *c, const char *path, ...) {
4470 va_list ap;
4471 char **l, *p;
4472
4473 assert(c);
4474 assert(path);
4475
4476 va_start(ap, path);
4477 l = strv_new_ap(path, ap);
4478 va_end(ap);
4479
4480 if (!l)
4481 return -ENOMEM;
4482
4483 p = strdup(path);
4484 if (!p) {
4485 strv_free(l);
4486 return -ENOMEM;
4487 }
4488
4489 free(c->path);
4490 c->path = p;
4491
4492 return strv_free_and_replace(c->argv, l);
4493 }
4494
4495 int exec_command_append(ExecCommand *c, const char *path, ...) {
4496 _cleanup_strv_free_ char **l = NULL;
4497 va_list ap;
4498 int r;
4499
4500 assert(c);
4501 assert(path);
4502
4503 va_start(ap, path);
4504 l = strv_new_ap(path, ap);
4505 va_end(ap);
4506
4507 if (!l)
4508 return -ENOMEM;
4509
4510 r = strv_extend_strv(&c->argv, l, false);
4511 if (r < 0)
4512 return r;
4513
4514 return 0;
4515 }
4516
4517 static void *remove_tmpdir_thread(void *p) {
4518 _cleanup_free_ char *path = p;
4519
4520 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4521 return NULL;
4522 }
4523
4524 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
4525 int r;
4526
4527 if (!rt)
4528 return NULL;
4529
4530 if (rt->manager)
4531 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
4532
4533 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
4534 if (destroy && rt->tmp_dir) {
4535 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4536
4537 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4538 if (r < 0) {
4539 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4540 free(rt->tmp_dir);
4541 }
4542
4543 rt->tmp_dir = NULL;
4544 }
4545
4546 if (destroy && rt->var_tmp_dir) {
4547 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4548
4549 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4550 if (r < 0) {
4551 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4552 free(rt->var_tmp_dir);
4553 }
4554
4555 rt->var_tmp_dir = NULL;
4556 }
4557
4558 rt->id = mfree(rt->id);
4559 rt->tmp_dir = mfree(rt->tmp_dir);
4560 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
4561 safe_close_pair(rt->netns_storage_socket);
4562 return mfree(rt);
4563 }
4564
4565 static void exec_runtime_freep(ExecRuntime **rt) {
4566 if (*rt)
4567 (void) exec_runtime_free(*rt, false);
4568 }
4569
4570 static int exec_runtime_allocate(ExecRuntime **rt) {
4571 assert(rt);
4572
4573 *rt = new0(ExecRuntime, 1);
4574 if (!*rt)
4575 return -ENOMEM;
4576
4577 (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4578 return 0;
4579 }
4580
4581 static int exec_runtime_add(
4582 Manager *m,
4583 const char *id,
4584 const char *tmp_dir,
4585 const char *var_tmp_dir,
4586 const int netns_storage_socket[2],
4587 ExecRuntime **ret) {
4588
4589 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
4590 int r;
4591
4592 assert(m);
4593 assert(id);
4594
4595 r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
4596 if (r < 0)
4597 return r;
4598
4599 r = exec_runtime_allocate(&rt);
4600 if (r < 0)
4601 return r;
4602
4603 rt->id = strdup(id);
4604 if (!rt->id)
4605 return -ENOMEM;
4606
4607 if (tmp_dir) {
4608 rt->tmp_dir = strdup(tmp_dir);
4609 if (!rt->tmp_dir)
4610 return -ENOMEM;
4611
4612 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
4613 assert(var_tmp_dir);
4614 rt->var_tmp_dir = strdup(var_tmp_dir);
4615 if (!rt->var_tmp_dir)
4616 return -ENOMEM;
4617 }
4618
4619 if (netns_storage_socket) {
4620 rt->netns_storage_socket[0] = netns_storage_socket[0];
4621 rt->netns_storage_socket[1] = netns_storage_socket[1];
4622 }
4623
4624 r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
4625 if (r < 0)
4626 return r;
4627
4628 rt->manager = m;
4629
4630 if (ret)
4631 *ret = rt;
4632
4633 /* do not remove created ExecRuntime object when the operation succeeds. */
4634 rt = NULL;
4635 return 0;
4636 }
4637
4638 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
4639 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
4640 _cleanup_close_pair_ int netns_storage_socket[2] = {-1, -1};
4641 int r;
4642
4643 assert(m);
4644 assert(c);
4645 assert(id);
4646
4647 /* It is not necessary to create ExecRuntime object. */
4648 if (!c->private_network && !c->private_tmp)
4649 return 0;
4650
4651 if (c->private_tmp) {
4652 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
4653 if (r < 0)
4654 return r;
4655 }
4656
4657 if (c->private_network) {
4658 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
4659 return -errno;
4660 }
4661
4662 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
4663 if (r < 0)
4664 return r;
4665
4666 /* Avoid cleanup */
4667 netns_storage_socket[0] = -1;
4668 netns_storage_socket[1] = -1;
4669 return 1;
4670 }
4671
4672 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
4673 ExecRuntime *rt;
4674 int r;
4675
4676 assert(m);
4677 assert(id);
4678 assert(ret);
4679
4680 rt = hashmap_get(m->exec_runtime_by_id, id);
4681 if (rt)
4682 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
4683 goto ref;
4684
4685 if (!create)
4686 return 0;
4687
4688 /* If not found, then create a new object. */
4689 r = exec_runtime_make(m, c, id, &rt);
4690 if (r <= 0)
4691 /* When r == 0, it is not necessary to create ExecRuntime object. */
4692 return r;
4693
4694 ref:
4695 /* increment reference counter. */
4696 rt->n_ref++;
4697 *ret = rt;
4698 return 1;
4699 }
4700
4701 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
4702 if (!rt)
4703 return NULL;
4704
4705 assert(rt->n_ref > 0);
4706
4707 rt->n_ref--;
4708 if (rt->n_ref > 0)
4709 return NULL;
4710
4711 return exec_runtime_free(rt, destroy);
4712 }
4713
4714 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
4715 ExecRuntime *rt;
4716 Iterator i;
4717
4718 assert(m);
4719 assert(f);
4720 assert(fds);
4721
4722 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4723 fprintf(f, "exec-runtime=%s", rt->id);
4724
4725 if (rt->tmp_dir)
4726 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
4727
4728 if (rt->var_tmp_dir)
4729 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
4730
4731 if (rt->netns_storage_socket[0] >= 0) {
4732 int copy;
4733
4734 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4735 if (copy < 0)
4736 return copy;
4737
4738 fprintf(f, " netns-socket-0=%i", copy);
4739 }
4740
4741 if (rt->netns_storage_socket[1] >= 0) {
4742 int copy;
4743
4744 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4745 if (copy < 0)
4746 return copy;
4747
4748 fprintf(f, " netns-socket-1=%i", copy);
4749 }
4750
4751 fputc('\n', f);
4752 }
4753
4754 return 0;
4755 }
4756
4757 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
4758 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
4759 ExecRuntime *rt;
4760 int r;
4761
4762 /* This is for the migration from old (v237 or earlier) deserialization text.
4763 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
4764 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
4765 * so or not from the serialized text, then we always creates a new object owned by this. */
4766
4767 assert(u);
4768 assert(key);
4769 assert(value);
4770
4771 /* Manager manages ExecRuntime objects by the unit id.
4772 * So, we omit the serialized text when the unit does not have id (yet?)... */
4773 if (isempty(u->id)) {
4774 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
4775 return 0;
4776 }
4777
4778 r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
4779 if (r < 0) {
4780 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
4781 return 0;
4782 }
4783
4784 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
4785 if (!rt) {
4786 r = exec_runtime_allocate(&rt_create);
4787 if (r < 0)
4788 return log_oom();
4789
4790 rt_create->id = strdup(u->id);
4791 if (!rt_create->id)
4792 return log_oom();
4793
4794 rt = rt_create;
4795 }
4796
4797 if (streq(key, "tmp-dir")) {
4798 char *copy;
4799
4800 copy = strdup(value);
4801 if (!copy)
4802 return log_oom();
4803
4804 free_and_replace(rt->tmp_dir, copy);
4805
4806 } else if (streq(key, "var-tmp-dir")) {
4807 char *copy;
4808
4809 copy = strdup(value);
4810 if (!copy)
4811 return log_oom();
4812
4813 free_and_replace(rt->var_tmp_dir, copy);
4814
4815 } else if (streq(key, "netns-socket-0")) {
4816 int fd;
4817
4818 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4819 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4820 return 0;
4821 }
4822
4823 safe_close(rt->netns_storage_socket[0]);
4824 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
4825
4826 } else if (streq(key, "netns-socket-1")) {
4827 int fd;
4828
4829 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4830 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4831 return 0;
4832 }
4833
4834 safe_close(rt->netns_storage_socket[1]);
4835 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
4836 } else
4837 return 0;
4838
4839 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
4840 if (rt_create) {
4841 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
4842 if (r < 0) {
4843 log_unit_debug_errno(u, r, "Failed to put runtime paramter to manager's storage: %m");
4844 return 0;
4845 }
4846
4847 rt_create->manager = u->manager;
4848
4849 /* Avoid cleanup */
4850 rt_create = NULL;
4851 }
4852
4853 return 1;
4854 }
4855
4856 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
4857 char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
4858 int r, fd0 = -1, fd1 = -1;
4859 const char *p, *v = value;
4860 size_t n;
4861
4862 assert(m);
4863 assert(value);
4864 assert(fds);
4865
4866 n = strcspn(v, " ");
4867 id = strndupa(v, n);
4868 if (v[n] != ' ')
4869 goto finalize;
4870 p = v + n + 1;
4871
4872 v = startswith(p, "tmp-dir=");
4873 if (v) {
4874 n = strcspn(v, " ");
4875 tmp_dir = strndupa(v, n);
4876 if (v[n] != ' ')
4877 goto finalize;
4878 p = v + n + 1;
4879 }
4880
4881 v = startswith(p, "var-tmp-dir=");
4882 if (v) {
4883 n = strcspn(v, " ");
4884 var_tmp_dir = strndupa(v, n);
4885 if (v[n] != ' ')
4886 goto finalize;
4887 p = v + n + 1;
4888 }
4889
4890 v = startswith(p, "netns-socket-0=");
4891 if (v) {
4892 char *buf;
4893
4894 n = strcspn(v, " ");
4895 buf = strndupa(v, n);
4896 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
4897 log_debug("Unable to process exec-runtime netns fd specification.");
4898 return;
4899 }
4900 fd0 = fdset_remove(fds, fd0);
4901 if (v[n] != ' ')
4902 goto finalize;
4903 p = v + n + 1;
4904 }
4905
4906 v = startswith(p, "netns-socket-1=");
4907 if (v) {
4908 char *buf;
4909
4910 n = strcspn(v, " ");
4911 buf = strndupa(v, n);
4912 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
4913 log_debug("Unable to process exec-runtime netns fd specification.");
4914 return;
4915 }
4916 fd1 = fdset_remove(fds, fd1);
4917 }
4918
4919 finalize:
4920
4921 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
4922 if (r < 0) {
4923 log_debug_errno(r, "Failed to add exec-runtime: %m");
4924 return;
4925 }
4926 }
4927
4928 void exec_runtime_vacuum(Manager *m) {
4929 ExecRuntime *rt;
4930 Iterator i;
4931
4932 assert(m);
4933
4934 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
4935
4936 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4937 if (rt->n_ref > 0)
4938 continue;
4939
4940 (void) exec_runtime_free(rt, false);
4941 }
4942 }
4943
4944 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
4945 [EXEC_INPUT_NULL] = "null",
4946 [EXEC_INPUT_TTY] = "tty",
4947 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4948 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
4949 [EXEC_INPUT_SOCKET] = "socket",
4950 [EXEC_INPUT_NAMED_FD] = "fd",
4951 [EXEC_INPUT_DATA] = "data",
4952 [EXEC_INPUT_FILE] = "file",
4953 };
4954
4955 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
4956
4957 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
4958 [EXEC_OUTPUT_INHERIT] = "inherit",
4959 [EXEC_OUTPUT_NULL] = "null",
4960 [EXEC_OUTPUT_TTY] = "tty",
4961 [EXEC_OUTPUT_SYSLOG] = "syslog",
4962 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
4963 [EXEC_OUTPUT_KMSG] = "kmsg",
4964 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
4965 [EXEC_OUTPUT_JOURNAL] = "journal",
4966 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
4967 [EXEC_OUTPUT_SOCKET] = "socket",
4968 [EXEC_OUTPUT_NAMED_FD] = "fd",
4969 [EXEC_OUTPUT_FILE] = "file",
4970 };
4971
4972 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
4973
4974 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
4975 [EXEC_UTMP_INIT] = "init",
4976 [EXEC_UTMP_LOGIN] = "login",
4977 [EXEC_UTMP_USER] = "user",
4978 };
4979
4980 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
4981
4982 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
4983 [EXEC_PRESERVE_NO] = "no",
4984 [EXEC_PRESERVE_YES] = "yes",
4985 [EXEC_PRESERVE_RESTART] = "restart",
4986 };
4987
4988 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
4989
4990 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
4991 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
4992 [EXEC_DIRECTORY_STATE] = "StateDirectory",
4993 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
4994 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
4995 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
4996 };
4997
4998 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
4999
5000 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5001 [EXEC_KEYRING_INHERIT] = "inherit",
5002 [EXEC_KEYRING_PRIVATE] = "private",
5003 [EXEC_KEYRING_SHARED] = "shared",
5004 };
5005
5006 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);