]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
core: usually our enum's _INVALID and _MAX special values are named after the full...
[thirdparty/systemd.git] / src / core / execute.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <errno.h>
21 #include <fcntl.h>
22 #include <glob.h>
23 #include <grp.h>
24 #include <poll.h>
25 #include <signal.h>
26 #include <string.h>
27 #include <sys/capability.h>
28 #include <sys/eventfd.h>
29 #include <sys/mman.h>
30 #include <sys/personality.h>
31 #include <sys/prctl.h>
32 #include <sys/shm.h>
33 #include <sys/socket.h>
34 #include <sys/stat.h>
35 #include <sys/types.h>
36 #include <sys/un.h>
37 #include <unistd.h>
38 #include <utmpx.h>
39
40 #ifdef HAVE_PAM
41 #include <security/pam_appl.h>
42 #endif
43
44 #ifdef HAVE_SELINUX
45 #include <selinux/selinux.h>
46 #endif
47
48 #ifdef HAVE_SECCOMP
49 #include <seccomp.h>
50 #endif
51
52 #ifdef HAVE_APPARMOR
53 #include <sys/apparmor.h>
54 #endif
55
56 #include "sd-messages.h"
57
58 #include "af-list.h"
59 #include "alloc-util.h"
60 #ifdef HAVE_APPARMOR
61 #include "apparmor-util.h"
62 #endif
63 #include "async.h"
64 #include "barrier.h"
65 #include "cap-list.h"
66 #include "capability-util.h"
67 #include "chown-recursive.h"
68 #include "def.h"
69 #include "env-util.h"
70 #include "errno-list.h"
71 #include "execute.h"
72 #include "exit-status.h"
73 #include "fd-util.h"
74 #include "fileio.h"
75 #include "format-util.h"
76 #include "fs-util.h"
77 #include "glob-util.h"
78 #include "io-util.h"
79 #include "ioprio.h"
80 #include "label.h"
81 #include "log.h"
82 #include "macro.h"
83 #include "missing.h"
84 #include "mkdir.h"
85 #include "namespace.h"
86 #include "parse-util.h"
87 #include "path-util.h"
88 #include "process-util.h"
89 #include "rlimit-util.h"
90 #include "rm-rf.h"
91 #ifdef HAVE_SECCOMP
92 #include "seccomp-util.h"
93 #endif
94 #include "securebits.h"
95 #include "securebits-util.h"
96 #include "selinux-util.h"
97 #include "signal-util.h"
98 #include "smack-util.h"
99 #include "special.h"
100 #include "string-table.h"
101 #include "string-util.h"
102 #include "strv.h"
103 #include "syslog-util.h"
104 #include "terminal-util.h"
105 #include "unit.h"
106 #include "user-util.h"
107 #include "util.h"
108 #include "utmp-wtmp.h"
109
/* Two timeouts of 5s and 1s, expressed in microseconds.
 * NOTE(review): their users are not in the visible part of this file — confirm what they time out. */
110 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
111 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
112
113 /* This assumes there is a 'tty' group */
/* Permission bits chown_terminal() applies to a terminal with fchmod() and then verifies
 * (read/write for the owner, write for the group). */
114 #define TTY_MODE 0620
115
/* Send buffer size (8 MiB) that connect_logger_as() requests for the journal stream socket
 * through fd_inc_sndbuf(). */
116 #define SNDBUF_SIZE (8*1024*1024)
117
/* Moves the n_fds descriptors in fds[] so that fds[i] ends up being descriptor number i+3.
 * The array is updated in place, and every descriptor that had to be moved is closed after
 * it has been duplicated. Returns 0 on success, or a negative errno if fcntl() fails. */
static int shift_fds(int fds[], unsigned n_fds) {
        int first = 0, retry_at;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int idx;

                retry_at = -1;

                for (idx = first; idx < (int) n_fds; idx++) {
                        const int wanted = idx + 3;
                        int dup_fd;

                        /* Nothing to do if the descriptor already sits in its slot */
                        if (fds[idx] == wanted)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* F_DUPFD hands out the lowest free number >= wanted. If the slot was
                         * still occupied we got something else, so remember the first such
                         * index and make another pass starting there. */
                        if (retry_at < 0 && dup_fd != wanted)
                                retry_at = idx;
                }

                first = retry_at;
        } while (retry_at >= 0);

        return 0;
}
162
/* Adjusts the file flags of all passed descriptors before they are handed to a child.
 * FD_CLOEXEC is cleared on every descriptor; O_NONBLOCK is set according to "nonblock",
 * but only on the first n_socket_fds entries of the array.
 * Returns 0 on success, or the negative error of the first helper that fails. */
static int flags_fds(const int fds[], unsigned n_storage_fds, unsigned n_socket_fds, bool nonblock) {
        const unsigned total = n_storage_fds + n_socket_fds;
        unsigned k;

        if (total == 0)
                return 0;

        assert(fds);

        for (k = 0; k < total; k++) {
                int rc;

                if (k < n_socket_fds) {
                        rc = fd_nonblock(fds[k], nonblock);
                        if (rc < 0)
                                return rc;
                }

                /* The descriptors are meant to survive exec(), hence always drop FD_CLOEXEC */
                rc = fd_cloexec(fds[k], false);
                if (rc < 0)
                        return rc;
        }

        return 0;
}
195
196 static const char *exec_context_tty_path(const ExecContext *context) {
197 assert(context);
198
199 if (context->stdio_as_fds)
200 return NULL;
201
202 if (context->tty_path)
203 return context->tty_path;
204
205 return "/dev/console";
206 }
207
208 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
209 const char *path;
210
211 assert(context);
212
213 path = exec_context_tty_path(context);
214
215 if (context->tty_vhangup) {
216 if (p && p->stdin_fd >= 0)
217 (void) terminal_vhangup_fd(p->stdin_fd);
218 else if (path)
219 (void) terminal_vhangup(path);
220 }
221
222 if (context->tty_reset) {
223 if (p && p->stdin_fd >= 0)
224 (void) reset_terminal_fd(p->stdin_fd, true);
225 else if (path)
226 (void) reset_terminal(path);
227 }
228
229 if (context->tty_vt_disallocate && path)
230 (void) vt_disallocate(path);
231 }
232
233 static bool is_terminal_input(ExecInput i) {
234 return IN_SET(i,
235 EXEC_INPUT_TTY,
236 EXEC_INPUT_TTY_FORCE,
237 EXEC_INPUT_TTY_FAIL);
238 }
239
240 static bool is_terminal_output(ExecOutput o) {
241 return IN_SET(o,
242 EXEC_OUTPUT_TTY,
243 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
244 EXEC_OUTPUT_KMSG_AND_CONSOLE,
245 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
246 }
247
248 static bool is_syslog_output(ExecOutput o) {
249 return IN_SET(o,
250 EXEC_OUTPUT_SYSLOG,
251 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
252 }
253
254 static bool is_kmsg_output(ExecOutput o) {
255 return IN_SET(o,
256 EXEC_OUTPUT_KMSG,
257 EXEC_OUTPUT_KMSG_AND_CONSOLE);
258 }
259
260 static bool exec_context_needs_term(const ExecContext *c) {
261 assert(c);
262
263 /* Return true if the execution context suggests we should set $TERM to something useful. */
264
265 if (is_terminal_input(c->std_input))
266 return true;
267
268 if (is_terminal_output(c->std_output))
269 return true;
270
271 if (is_terminal_output(c->std_error))
272 return true;
273
274 return !!c->tty_path;
275 }
276
/* Opens /dev/null with the given access flags (plus O_NOCTTY) and installs it as descriptor
 * number nfd. Returns nfd on success, a negative errno otherwise. */
static int open_null_as(int flags, int nfd) {
        int null_fd, ret;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        /* open() may already have returned the wanted number */
        if (null_fd == nfd)
                return nfd;

        ret = dup2(null_fd, nfd) < 0 ? -errno : nfd;
        safe_close(null_fd);

        return ret;
}
294
295 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
296 static const union sockaddr_union sa = {
297 .un.sun_family = AF_UNIX,
298 .un.sun_path = "/run/systemd/journal/stdout",
299 };
300 uid_t olduid = UID_INVALID;
301 gid_t oldgid = GID_INVALID;
302 int r;
303
304 if (gid_is_valid(gid)) {
305 oldgid = getgid();
306
307 if (setegid(gid) < 0)
308 return -errno;
309 }
310
311 if (uid_is_valid(uid)) {
312 olduid = getuid();
313
314 if (seteuid(uid) < 0) {
315 r = -errno;
316 goto restore_gid;
317 }
318 }
319
320 r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
321
322 /* If we fail to restore the uid or gid, things will likely
323 fail later on. This should only happen if an LSM interferes. */
324
325 if (uid_is_valid(uid))
326 (void) seteuid(olduid);
327
328 restore_gid:
329 if (gid_is_valid(gid))
330 (void) setegid(oldgid);
331
332 return r;
333 }
334
335 static int connect_logger_as(
336 Unit *unit,
337 const ExecContext *context,
338 const ExecParameters *params,
339 ExecOutput output,
340 const char *ident,
341 int nfd,
342 uid_t uid,
343 gid_t gid) {
344
345 int fd, r;
346
347 assert(context);
348 assert(params);
349 assert(output < _EXEC_OUTPUT_MAX);
350 assert(ident);
351 assert(nfd >= 0);
352
353 fd = socket(AF_UNIX, SOCK_STREAM, 0);
354 if (fd < 0)
355 return -errno;
356
357 r = connect_journal_socket(fd, uid, gid);
358 if (r < 0)
359 return r;
360
361 if (shutdown(fd, SHUT_RD) < 0) {
362 safe_close(fd);
363 return -errno;
364 }
365
366 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
367
368 dprintf(fd,
369 "%s\n"
370 "%s\n"
371 "%i\n"
372 "%i\n"
373 "%i\n"
374 "%i\n"
375 "%i\n",
376 context->syslog_identifier ?: ident,
377 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
378 context->syslog_priority,
379 !!context->syslog_level_prefix,
380 is_syslog_output(output),
381 is_kmsg_output(output),
382 is_terminal_output(output));
383
384 if (fd == nfd)
385 return nfd;
386
387 r = dup2(fd, nfd) < 0 ? -errno : nfd;
388 safe_close(fd);
389
390 return r;
391 }
/* Opens the terminal "path" with the given mode (plus O_NOCTTY) and installs it as descriptor
 * number nfd. Returns nfd on success, or the negative error from open_terminal()/dup2(). */
static int open_terminal_as(const char *path, mode_t mode, int nfd) {
        int tty_fd, ret;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, mode | O_NOCTTY);
        if (tty_fd < 0)
                return tty_fd;

        if (tty_fd == nfd)
                return nfd;

        ret = dup2(tty_fd, nfd) < 0 ? -errno : nfd;
        safe_close(tty_fd);

        return ret;
}
410
411 static int fixup_input(ExecInput std_input, int socket_fd, bool apply_tty_stdin) {
412
413 if (is_terminal_input(std_input) && !apply_tty_stdin)
414 return EXEC_INPUT_NULL;
415
416 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
417 return EXEC_INPUT_NULL;
418
419 return std_input;
420 }
421
422 static int fixup_output(ExecOutput std_output, int socket_fd) {
423
424 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
425 return EXEC_OUTPUT_INHERIT;
426
427 return std_output;
428 }
429
430 static int setup_input(
431 const ExecContext *context,
432 const ExecParameters *params,
433 int socket_fd,
434 int named_iofds[3]) {
435
436 ExecInput i;
437
438 assert(context);
439 assert(params);
440
441 if (params->stdin_fd >= 0) {
442 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
443 return -errno;
444
445 /* Try to make this the controlling tty, if it is a tty, and reset it */
446 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
447 (void) reset_terminal_fd(STDIN_FILENO, true);
448
449 return STDIN_FILENO;
450 }
451
452 i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
453
454 switch (i) {
455
456 case EXEC_INPUT_NULL:
457 return open_null_as(O_RDONLY, STDIN_FILENO);
458
459 case EXEC_INPUT_TTY:
460 case EXEC_INPUT_TTY_FORCE:
461 case EXEC_INPUT_TTY_FAIL: {
462 int fd, r;
463
464 fd = acquire_terminal(exec_context_tty_path(context),
465 i == EXEC_INPUT_TTY_FAIL,
466 i == EXEC_INPUT_TTY_FORCE,
467 false,
468 USEC_INFINITY);
469 if (fd < 0)
470 return fd;
471
472 if (fd != STDIN_FILENO) {
473 r = dup2(fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
474 safe_close(fd);
475 } else
476 r = STDIN_FILENO;
477
478 return r;
479 }
480
481 case EXEC_INPUT_SOCKET:
482 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
483
484 case EXEC_INPUT_NAMED_FD:
485 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
486 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
487
488 default:
489 assert_not_reached("Unknown input type");
490 }
491 }
492
493 static int setup_output(
494 Unit *unit,
495 const ExecContext *context,
496 const ExecParameters *params,
497 int fileno,
498 int socket_fd,
499 int named_iofds[3],
500 const char *ident,
501 uid_t uid,
502 gid_t gid,
503 dev_t *journal_stream_dev,
504 ino_t *journal_stream_ino) {
505
506 ExecOutput o;
507 ExecInput i;
508 int r;
509
510 assert(unit);
511 assert(context);
512 assert(params);
513 assert(ident);
514 assert(journal_stream_dev);
515 assert(journal_stream_ino);
516
517 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
518
519 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
520 return -errno;
521
522 return STDOUT_FILENO;
523 }
524
525 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
526 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
527 return -errno;
528
529 return STDERR_FILENO;
530 }
531
532 i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
533 o = fixup_output(context->std_output, socket_fd);
534
535 if (fileno == STDERR_FILENO) {
536 ExecOutput e;
537 e = fixup_output(context->std_error, socket_fd);
538
539 /* This expects the input and output are already set up */
540
541 /* Don't change the stderr file descriptor if we inherit all
542 * the way and are not on a tty */
543 if (e == EXEC_OUTPUT_INHERIT &&
544 o == EXEC_OUTPUT_INHERIT &&
545 i == EXEC_INPUT_NULL &&
546 !is_terminal_input(context->std_input) &&
547 getppid () != 1)
548 return fileno;
549
550 /* Duplicate from stdout if possible */
551 if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
552 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
553
554 o = e;
555
556 } else if (o == EXEC_OUTPUT_INHERIT) {
557 /* If input got downgraded, inherit the original value */
558 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
559 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
560
561 /* If the input is connected to anything that's not a /dev/null, inherit that... */
562 if (i != EXEC_INPUT_NULL)
563 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
564
565 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
566 if (getppid() != 1)
567 return fileno;
568
569 /* We need to open /dev/null here anew, to get the right access mode. */
570 return open_null_as(O_WRONLY, fileno);
571 }
572
573 switch (o) {
574
575 case EXEC_OUTPUT_NULL:
576 return open_null_as(O_WRONLY, fileno);
577
578 case EXEC_OUTPUT_TTY:
579 if (is_terminal_input(i))
580 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
581
582 /* We don't reset the terminal if this is just about output */
583 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
584
585 case EXEC_OUTPUT_SYSLOG:
586 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
587 case EXEC_OUTPUT_KMSG:
588 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
589 case EXEC_OUTPUT_JOURNAL:
590 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
591 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
592 if (r < 0) {
593 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
594 r = open_null_as(O_WRONLY, fileno);
595 } else {
596 struct stat st;
597
598 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
599 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
600 * services to detect whether they are connected to the journal or not.
601 *
602 * If both stdout and stderr are connected to a stream then let's make sure to store the data
603 * about STDERR as that's usually the best way to do logging. */
604
605 if (fstat(fileno, &st) >= 0 &&
606 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
607 *journal_stream_dev = st.st_dev;
608 *journal_stream_ino = st.st_ino;
609 }
610 }
611 return r;
612
613 case EXEC_OUTPUT_SOCKET:
614 assert(socket_fd >= 0);
615 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
616
617 case EXEC_OUTPUT_NAMED_FD:
618 (void) fd_nonblock(named_iofds[fileno], false);
619 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
620
621 default:
622 assert_not_reached("Unknown error type");
623 }
624 }
625
626 static int chown_terminal(int fd, uid_t uid) {
627 struct stat st;
628
629 assert(fd >= 0);
630
631 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
632 if (isatty(fd) < 1)
633 return 0;
634
635 /* This might fail. What matters are the results. */
636 (void) fchown(fd, uid, -1);
637 (void) fchmod(fd, TTY_MODE);
638
639 if (fstat(fd, &st) < 0)
640 return -errno;
641
642 if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
643 return -EPERM;
644
645 return 0;
646 }
647
648 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
649 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
650 int r;
651
652 assert(_saved_stdin);
653 assert(_saved_stdout);
654
655 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
656 if (saved_stdin < 0)
657 return -errno;
658
659 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
660 if (saved_stdout < 0)
661 return -errno;
662
663 fd = acquire_terminal(vc, false, false, false, DEFAULT_CONFIRM_USEC);
664 if (fd < 0)
665 return fd;
666
667 r = chown_terminal(fd, getuid());
668 if (r < 0)
669 return r;
670
671 r = reset_terminal_fd(fd, true);
672 if (r < 0)
673 return r;
674
675 if (dup2(fd, STDIN_FILENO) < 0)
676 return -errno;
677
678 if (dup2(fd, STDOUT_FILENO) < 0)
679 return -errno;
680
681 if (fd >= 2)
682 safe_close(fd);
683 fd = -1;
684
685 *_saved_stdin = saved_stdin;
686 *_saved_stdout = saved_stdout;
687
688 saved_stdin = saved_stdout = -1;
689
690 return 0;
691 }
692
693 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
694 assert(err < 0);
695
696 if (err == -ETIMEDOUT)
697 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
698 else {
699 errno = -err;
700 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
701 }
702 }
703
704 static void write_confirm_error(int err, const char *vc, const Unit *u) {
705 _cleanup_close_ int fd = -1;
706
707 assert(vc);
708
709 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
710 if (fd < 0)
711 return;
712
713 write_confirm_error_fd(err, fd, u);
714 }
715
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved descriptors back in
 * place as stdin/stdout, and closes and invalidates the saved copies. Returns 0, or the
 * negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
737
/* Results of ask_for_confirmation():
 *   CONFIRM_PRETEND_FAILURE – chosen with 'f': do not execute, pretend the command failed
 *   CONFIRM_PRETEND_SUCCESS – chosen with 's': do not execute, pretend the command succeeded
 *   CONFIRM_EXECUTE         – chosen with 'y', and also used whenever asking fails */
738 enum {
739 CONFIRM_PRETEND_FAILURE = -1,
740 CONFIRM_PRETEND_SUCCESS = 0,
741 CONFIRM_EXECUTE = 1,
742 };
743
744 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
745 int saved_stdout = -1, saved_stdin = -1, r;
746 _cleanup_free_ char *e = NULL;
747 char c;
748
749 /* For any internal errors, assume a positive response. */
750 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
751 if (r < 0) {
752 write_confirm_error(r, vc, u);
753 return CONFIRM_EXECUTE;
754 }
755
756 /* confirm_spawn might have been disabled while we were sleeping. */
757 if (manager_is_confirm_spawn_disabled(u->manager)) {
758 r = 1;
759 goto restore_stdio;
760 }
761
762 e = ellipsize(cmdline, 60, 100);
763 if (!e) {
764 log_oom();
765 r = CONFIRM_EXECUTE;
766 goto restore_stdio;
767 }
768
769 for (;;) {
770 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
771 if (r < 0) {
772 write_confirm_error_fd(r, STDOUT_FILENO, u);
773 r = CONFIRM_EXECUTE;
774 goto restore_stdio;
775 }
776
777 switch (c) {
778 case 'c':
779 printf("Resuming normal execution.\n");
780 manager_disable_confirm_spawn();
781 r = 1;
782 break;
783 case 'D':
784 unit_dump(u, stdout, " ");
785 continue; /* ask again */
786 case 'f':
787 printf("Failing execution.\n");
788 r = CONFIRM_PRETEND_FAILURE;
789 break;
790 case 'h':
791 printf(" c - continue, proceed without asking anymore\n"
792 " D - dump, show the state of the unit\n"
793 " f - fail, don't execute the command and pretend it failed\n"
794 " h - help\n"
795 " i - info, show a short summary of the unit\n"
796 " j - jobs, show jobs that are in progress\n"
797 " s - skip, don't execute the command and pretend it succeeded\n"
798 " y - yes, execute the command\n");
799 continue; /* ask again */
800 case 'i':
801 printf(" Description: %s\n"
802 " Unit: %s\n"
803 " Command: %s\n",
804 u->id, u->description, cmdline);
805 continue; /* ask again */
806 case 'j':
807 manager_dump_jobs(u->manager, stdout, " ");
808 continue; /* ask again */
809 case 'n':
810 /* 'n' was removed in favor of 'f'. */
811 printf("Didn't understand 'n', did you mean 'f'?\n");
812 continue; /* ask again */
813 case 's':
814 printf("Skipping execution.\n");
815 r = CONFIRM_PRETEND_SUCCESS;
816 break;
817 case 'y':
818 r = CONFIRM_EXECUTE;
819 break;
820 default:
821 assert_not_reached("Unhandled choice");
822 }
823 break;
824 }
825
826 restore_stdio:
827 restore_confirm_stdio(&saved_stdin, &saved_stdout);
828 return r;
829 }
830
831 static int get_fixed_user(const ExecContext *c, const char **user,
832 uid_t *uid, gid_t *gid,
833 const char **home, const char **shell) {
834 int r;
835 const char *name;
836
837 assert(c);
838
839 if (!c->user)
840 return 0;
841
842 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
843 * (i.e. are "/" or "/bin/nologin"). */
844
845 name = c->user;
846 r = get_user_creds_clean(&name, uid, gid, home, shell);
847 if (r < 0)
848 return r;
849
850 *user = name;
851 return 0;
852 }
853
854 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
855 int r;
856 const char *name;
857
858 assert(c);
859
860 if (!c->group)
861 return 0;
862
863 name = c->group;
864 r = get_group_creds(&name, gid);
865 if (r < 0)
866 return r;
867
868 *group = name;
869 return 0;
870 }
871
872 static int get_supplementary_groups(const ExecContext *c, const char *user,
873 const char *group, gid_t gid,
874 gid_t **supplementary_gids, int *ngids) {
875 char **i;
876 int r, k = 0;
877 int ngroups_max;
878 bool keep_groups = false;
879 gid_t *groups = NULL;
880 _cleanup_free_ gid_t *l_gids = NULL;
881
882 assert(c);
883
884 /*
885 * If user is given, then lookup GID and supplementary groups list.
886 * We avoid NSS lookups for gid=0. Also we have to initialize groups
887 * here and as early as possible so we keep the list of supplementary
888 * groups of the caller.
889 */
890 if (user && gid_is_valid(gid) && gid != 0) {
891 /* First step, initialize groups from /etc/groups */
892 if (initgroups(user, gid) < 0)
893 return -errno;
894
895 keep_groups = true;
896 }
897
898 if (!c->supplementary_groups)
899 return 0;
900
901 /*
902 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
903 * be positive, otherwise fail.
904 */
905 errno = 0;
906 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
907 if (ngroups_max <= 0) {
908 if (errno > 0)
909 return -errno;
910 else
911 return -EOPNOTSUPP; /* For all other values */
912 }
913
914 l_gids = new(gid_t, ngroups_max);
915 if (!l_gids)
916 return -ENOMEM;
917
918 if (keep_groups) {
919 /*
920 * Lookup the list of groups that the user belongs to, we
921 * avoid NSS lookups here too for gid=0.
922 */
923 k = ngroups_max;
924 if (getgrouplist(user, gid, l_gids, &k) < 0)
925 return -EINVAL;
926 } else
927 k = 0;
928
929 STRV_FOREACH(i, c->supplementary_groups) {
930 const char *g;
931
932 if (k >= ngroups_max)
933 return -E2BIG;
934
935 g = *i;
936 r = get_group_creds(&g, l_gids+k);
937 if (r < 0)
938 return r;
939
940 k++;
941 }
942
943 /*
944 * Sets ngids to zero to drop all supplementary groups, happens
945 * when we are under root and SupplementaryGroups= is empty.
946 */
947 if (k == 0) {
948 *ngids = 0;
949 return 0;
950 }
951
952 /* Otherwise get the final list of supplementary groups */
953 groups = memdup(l_gids, sizeof(gid_t) * k);
954 if (!groups)
955 return -ENOMEM;
956
957 *supplementary_gids = groups;
958 *ngids = k;
959
960 groups = NULL;
961
962 return 0;
963 }
964
965 static int enforce_groups(const ExecContext *context, gid_t gid,
966 gid_t *supplementary_gids, int ngids) {
967 int r;
968
969 assert(context);
970
971 /* Handle SupplementaryGroups= even if it is empty */
972 if (context->supplementary_groups) {
973 r = maybe_setgroups(ngids, supplementary_gids);
974 if (r < 0)
975 return r;
976 }
977
978 if (gid_is_valid(gid)) {
979 /* Then set our gids */
980 if (setresgid(gid, gid, gid) < 0)
981 return -errno;
982 }
983
984 return 0;
985 }
986
987 static int enforce_user(const ExecContext *context, uid_t uid) {
988 assert(context);
989
990 if (!uid_is_valid(uid))
991 return 0;
992
993 /* Sets (but doesn't look up) the uid and make sure we keep the
994 * capabilities while doing so. */
995
996 if (context->capability_ambient_set != 0) {
997
998 /* First step: If we need to keep capabilities but
999 * drop privileges we need to make sure we keep our
1000 * caps, while we drop privileges. */
1001 if (uid != 0) {
1002 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1003
1004 if (prctl(PR_GET_SECUREBITS) != sb)
1005 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1006 return -errno;
1007 }
1008 }
1009
1010 /* Second step: actually set the uids */
1011 if (setresuid(uid, uid, uid) < 0)
1012 return -errno;
1013
1014 /* At this point we should have all necessary capabilities but
1015 are otherwise a normal user. However, the caps might got
1016 corrupted due to the setresuid() so we need clean them up
1017 later. This is done outside of this call. */
1018
1019 return 0;
1020 }
1021
1022 #ifdef HAVE_PAM
1023
1024 static int null_conv(
1025 int num_msg,
1026 const struct pam_message **msg,
1027 struct pam_response **resp,
1028 void *appdata_ptr) {
1029
1030 /* We don't support conversations */
1031
1032 return PAM_CONV_ERR;
1033 }
1034
1035 #endif
1036
/* Opens a PAM session for the service "name" and the user "user", and forks off a helper
 * child named "(sd-pam)" that waits for SIGTERM and closes the session again once the parent
 * is gone.
 *
 *   uid, gid    – credentials the helper child switches to (failure to do so is only logged)
 *   tty         – if non-NULL, passed to PAM as PAM_TTY
 *   env         – in: variables exported into the PAM environment; out: on success the old
 *                 list is freed and replaced by the list returned by pam_getenvlist()
 *   fds, n_fds  – descriptors that are closed in the helper child
 *
 * Returns 0 on success, and always when built without HAVE_PAM. On failure a negative
 * errno-style value is returned; any PAM error is reported as -EPERM. */
1037 static int setup_pam(
1038 const char *name,
1039 const char *user,
1040 uid_t uid,
1041 gid_t gid,
1042 const char *tty,
1043 char ***env,
1044 int fds[], unsigned n_fds) {
1045
1046 #ifdef HAVE_PAM
1047
1048 static const struct pam_conv conv = {
1049 .conv = null_conv,
1050 .appdata_ptr = NULL
1051 };
1052
1053 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1054 pam_handle_t *handle = NULL;
1055 sigset_t old_ss;
1056 int pam_code = PAM_SUCCESS, r;
1057 char **nv, **e = NULL;
1058 bool close_session = false;
1059 pid_t pam_pid = 0, parent_pid;
1060 int flags = 0;
1061
1062 assert(name);
1063 assert(user);
1064 assert(env);
1065
1066 /* We set up PAM in the parent process, then fork. The child
1067 * will then stay around until killed via PR_SET_PDEATHSIG or
1068 * systemd via the cgroup logic. It will then remove the PAM
1069 * session again. The parent process will exec() the actual
1070 * daemon. We do things this way to ensure that the main PID
1071 * of the daemon is the one we initially fork()ed. */
1072
1073 r = barrier_create(&barrier);
1074 if (r < 0)
1075 goto fail;
1076
/* Keep PAM quiet unless debug logging is enabled */
1077 if (log_get_max_level() < LOG_DEBUG)
1078 flags |= PAM_SILENT;
1079
1080 pam_code = pam_start(name, user, &conv, &handle);
1081 if (pam_code != PAM_SUCCESS) {
1082 handle = NULL;
1083 goto fail;
1084 }
1085
1086 if (tty) {
1087 pam_code = pam_set_item(handle, PAM_TTY, tty);
1088 if (pam_code != PAM_SUCCESS)
1089 goto fail;
1090 }
1091
1092 STRV_FOREACH(nv, *env) {
1093 pam_code = pam_putenv(handle, *nv);
1094 if (pam_code != PAM_SUCCESS)
1095 goto fail;
1096 }
1097
1098 pam_code = pam_acct_mgmt(handle, flags);
1099 if (pam_code != PAM_SUCCESS)
1100 goto fail;
1101
1102 pam_code = pam_open_session(handle, flags);
1103 if (pam_code != PAM_SUCCESS)
1104 goto fail;
1105
/* From here on the failure path has to close the session again */
1106 close_session = true;
1107
1108 e = pam_getenvlist(handle);
1109 if (!e) {
1110 pam_code = PAM_BUF_ERR;
1111 goto fail;
1112 }
1113
1114 /* Block SIGTERM, so that we know that it won't get lost in
1115 * the child */
1116
1117 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1118
1119 parent_pid = getpid_cached();
1120
1121 pam_pid = fork();
/* NOTE(review): on fork() failure we jump to "fail" without restoring old_ss, so SIGTERM
 * stays blocked in this process — confirm that callers exit on error anyway. */
1122 if (pam_pid < 0) {
1123 r = -errno;
1124 goto fail;
1125 }
1126
1127 if (pam_pid == 0) {
1128 int sig, ret = EXIT_PAM;
1129
1130 /* The child's job is to reset the PAM session on
1131 * termination */
1132 barrier_set_role(&barrier, BARRIER_CHILD);
1133
1134 /* This string must fit in 10 chars (i.e. the length
1135 * of "/sbin/init"), to look pretty in /bin/ps */
1136 rename_process("(sd-pam)");
1137
1138 /* Make sure we don't keep open the passed fds in this
1139 child. We assume that otherwise only those fds are
1140 open here that have been opened by PAM. */
1141 close_many(fds, n_fds);
1142
1143 /* Drop privileges - we don't need any to pam_close_session
1144 * and this will make PR_SET_PDEATHSIG work in most cases.
1145 * If this fails, ignore the error - but expect sd-pam threads
1146 * to fail to exit normally */
1147
1148 r = maybe_setgroups(0, NULL);
1149 if (r < 0)
1150 log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
1151 if (setresgid(gid, gid, gid) < 0)
1152 log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
1153 if (setresuid(uid, uid, uid) < 0)
1154 log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
1155
1156 (void) ignore_signals(SIGPIPE, -1);
1157
1158 /* Wait until our parent died. This will only work if
1159 * the above setresuid() succeeds, otherwise the kernel
1160 * will not allow unprivileged parents kill their privileged
1161 * children this way. We rely on the control groups kill logic
1162 * to do the rest for us. */
1163 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1164 goto child_finish;
1165
1166 /* Tell the parent that our setup is done. This is especially
1167 * important regarding dropping privileges. Otherwise, unit
1168 * setup might race against our setresuid(2) call.
1169 *
1170 * If the parent aborted, we'll detect this below, hence ignore
1171 * return failure here. */
1172 (void) barrier_place(&barrier);
1173
1174 /* Check if our parent process might already have died? */
1175 if (getppid() == parent_pid) {
1176 sigset_t ss;
1177
1178 assert_se(sigemptyset(&ss) >= 0);
1179 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1180
/* SIGTERM was blocked before fork(), so it is picked up synchronously here */
1181 for (;;) {
1182 if (sigwait(&ss, &sig) < 0) {
1183 if (errno == EINTR)
1184 continue;
1185
1186 goto child_finish;
1187 }
1188
1189 assert(sig == SIGTERM);
1190 break;
1191 }
1192 }
1193
1194 /* If our parent died we'll end the session */
1195 if (getppid() != parent_pid) {
1196 pam_code = pam_close_session(handle, flags);
1197 if (pam_code != PAM_SUCCESS)
1198 goto child_finish;
1199 }
1200
1201 ret = 0;
1202
1203 child_finish:
1204 pam_end(handle, pam_code | flags);
1205 _exit(ret);
1206 }
1207
1208 barrier_set_role(&barrier, BARRIER_PARENT);
1209
1210 /* If the child was forked off successfully it will do all the
1211 * cleanups, so forget about the handle here. */
1212 handle = NULL;
1213
1214 /* Unblock SIGTERM again in the parent */
1215 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1216
1217 /* We close the log explicitly here, since the PAM modules
1218 * might have opened it, but we don't want this fd around. */
1219 closelog();
1220
1221 /* Synchronously wait for the child to initialize. We don't care for
1222 * errors as we cannot recover. However, warn loudly if it happens. */
1223 if (!barrier_place_and_sync(&barrier))
1224 log_error("PAM initialization failed");
1225
/* Replace the caller's environment with the one PAM handed back */
1226 strv_free(*env);
1227 *env = e;
1228
1229 return 0;
1230
1231 fail:
/* A PAM error takes precedence over whatever is stored in r */
1232 if (pam_code != PAM_SUCCESS) {
1233 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1234 r = -EPERM; /* PAM errors do not map to errno */
1235 } else
1236 log_error_errno(r, "PAM failed: %m");
1237
1238 if (handle) {
1239 if (close_session)
1240 pam_code = pam_close_session(handle, flags);
1241
1242 pam_end(handle, pam_code | flags);
1243 }
1244
1245 strv_free(e);
1246 closelog();
1247
1248 return r;
1249 #else
1250 return 0;
1251 #endif
1252 }
1253
/* Renames the current process to "(<name>)", where <name> is derived from the last component
 * of path: at most its final 8 characters are used, so that the result fits in 10 characters
 * (the length of "/sbin/init") and looks pretty in /bin/ps. If the last component is empty,
 * "(...)" is used. */
static void rename_process_from_path(const char *path) {
        char buf[11];
        const char *base;
        size_t n;

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        n = strlen(base);
        if (n > 8) {
                /* Keep the tail: it usually is the more interesting part, as the beginning
                 * might just be "systemd-" */
                base += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, base, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1284
1285 static bool context_has_address_families(const ExecContext *c) {
1286 assert(c);
1287
1288 return c->address_families_whitelist ||
1289 !set_isempty(c->address_families);
1290 }
1291
1292 static bool context_has_syscall_filters(const ExecContext *c) {
1293 assert(c);
1294
1295 return c->syscall_whitelist ||
1296 !set_isempty(c->syscall_filter);
1297 }
1298
/* Decides whether the "no new privileges" flag is needed for this context: either because it was requested
 * explicitly, or because we lack CAP_SYS_ADMIN and one of the settings that are implemented via seccomp is
 * enabled. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        /* Explicitly requested */
        if (c->no_new_privileges)
                return true;

        /* If we are privileged, we don't need NNP */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* Unprivileged: any form of seccomp requires NNP */
        if (context_has_address_families(c))
                return true;

        if (c->memory_deny_write_execute || c->restrict_realtime)
                return true;

        if (exec_context_restrict_namespaces_set(c))
                return true;

        if (c->protect_kernel_tunables || c->protect_kernel_modules || c->private_devices)
                return true;

        if (context_has_syscall_filters(c))
                return true;

        if (!set_isempty(c->syscall_archs))
                return true;

        return !!c->lock_personality;
}
1320
1321 #ifdef HAVE_SECCOMP
1322
1323 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1324
1325 if (is_seccomp_available())
1326 return false;
1327
1328 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1329 return true;
1330 }
1331
1332 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1333 uint32_t negative_action, default_action, action;
1334 int r;
1335
1336 assert(u);
1337 assert(c);
1338
1339 if (!context_has_syscall_filters(c))
1340 return 0;
1341
1342 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1343 return 0;
1344
1345 negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1346
1347 if (c->syscall_whitelist) {
1348 default_action = negative_action;
1349 action = SCMP_ACT_ALLOW;
1350 } else {
1351 default_action = SCMP_ACT_ALLOW;
1352 action = negative_action;
1353 }
1354
1355 if (needs_ambient_hack) {
1356 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1357 if (r < 0)
1358 return r;
1359 }
1360
1361 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1362 }
1363
1364 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1365 assert(u);
1366 assert(c);
1367
1368 if (set_isempty(c->syscall_archs))
1369 return 0;
1370
1371 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1372 return 0;
1373
1374 return seccomp_restrict_archs(c->syscall_archs);
1375 }
1376
1377 static int apply_address_families(const Unit* u, const ExecContext *c) {
1378 assert(u);
1379 assert(c);
1380
1381 if (!context_has_address_families(c))
1382 return 0;
1383
1384 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1385 return 0;
1386
1387 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1388 }
1389
1390 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1391 assert(u);
1392 assert(c);
1393
1394 if (!c->memory_deny_write_execute)
1395 return 0;
1396
1397 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1398 return 0;
1399
1400 return seccomp_memory_deny_write_execute();
1401 }
1402
1403 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1404 assert(u);
1405 assert(c);
1406
1407 if (!c->restrict_realtime)
1408 return 0;
1409
1410 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1411 return 0;
1412
1413 return seccomp_restrict_realtime();
1414 }
1415
1416 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1417 assert(u);
1418 assert(c);
1419
1420 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1421 * let's protect even those systems where this is left on in the kernel. */
1422
1423 if (!c->protect_kernel_tunables)
1424 return 0;
1425
1426 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1427 return 0;
1428
1429 return seccomp_protect_sysctl();
1430 }
1431
1432 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1433 assert(u);
1434 assert(c);
1435
1436 /* Turn off module syscalls on ProtectKernelModules=yes */
1437
1438 if (!c->protect_kernel_modules)
1439 return 0;
1440
1441 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1442 return 0;
1443
1444 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1445 }
1446
1447 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1448 assert(u);
1449 assert(c);
1450
1451 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1452
1453 if (!c->private_devices)
1454 return 0;
1455
1456 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1457 return 0;
1458
1459 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1460 }
1461
1462 static int apply_restrict_namespaces(Unit *u, const ExecContext *c) {
1463 assert(u);
1464 assert(c);
1465
1466 if (!exec_context_restrict_namespaces_set(c))
1467 return 0;
1468
1469 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1470 return 0;
1471
1472 return seccomp_restrict_namespaces(c->restrict_namespaces);
1473 }
1474
1475 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1476 unsigned long personality;
1477 int r;
1478
1479 assert(u);
1480 assert(c);
1481
1482 if (!c->lock_personality)
1483 return 0;
1484
1485 if (skip_seccomp_unavailable(u, "LockPersonality="))
1486 return 0;
1487
1488 personality = c->personality;
1489
1490 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1491 if (personality == PERSONALITY_INVALID) {
1492
1493 r = opinionated_personality(&personality);
1494 if (r < 0)
1495 return r;
1496 }
1497
1498 return seccomp_lock_personality(personality);
1499 }
1500
1501 #endif
1502
1503 static void do_idle_pipe_dance(int idle_pipe[4]) {
1504 assert(idle_pipe);
1505
1506 idle_pipe[1] = safe_close(idle_pipe[1]);
1507 idle_pipe[2] = safe_close(idle_pipe[2]);
1508
1509 if (idle_pipe[0] >= 0) {
1510 int r;
1511
1512 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1513
1514 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1515 ssize_t n;
1516
1517 /* Signal systemd that we are bored and want to continue. */
1518 n = write(idle_pipe[3], "x", 1);
1519 if (n > 0)
1520 /* Wait for systemd to react to the signal above. */
1521 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1522 }
1523
1524 idle_pipe[0] = safe_close(idle_pipe[0]);
1525
1526 }
1527
1528 idle_pipe[3] = safe_close(idle_pipe[3]);
1529 }
1530
1531 static int build_environment(
1532 Unit *u,
1533 const ExecContext *c,
1534 const ExecParameters *p,
1535 unsigned n_fds,
1536 const char *home,
1537 const char *username,
1538 const char *shell,
1539 dev_t journal_stream_dev,
1540 ino_t journal_stream_ino,
1541 char ***ret) {
1542
1543 _cleanup_strv_free_ char **our_env = NULL;
1544 unsigned n_env = 0;
1545 char *x;
1546
1547 assert(u);
1548 assert(c);
1549 assert(ret);
1550
1551 our_env = new0(char*, 14);
1552 if (!our_env)
1553 return -ENOMEM;
1554
1555 if (n_fds > 0) {
1556 _cleanup_free_ char *joined = NULL;
1557
1558 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1559 return -ENOMEM;
1560 our_env[n_env++] = x;
1561
1562 if (asprintf(&x, "LISTEN_FDS=%u", n_fds) < 0)
1563 return -ENOMEM;
1564 our_env[n_env++] = x;
1565
1566 joined = strv_join(p->fd_names, ":");
1567 if (!joined)
1568 return -ENOMEM;
1569
1570 x = strjoin("LISTEN_FDNAMES=", joined);
1571 if (!x)
1572 return -ENOMEM;
1573 our_env[n_env++] = x;
1574 }
1575
1576 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1577 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1578 return -ENOMEM;
1579 our_env[n_env++] = x;
1580
1581 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1582 return -ENOMEM;
1583 our_env[n_env++] = x;
1584 }
1585
1586 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1587 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1588 * check the database directly. */
1589 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1590 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1591 if (!x)
1592 return -ENOMEM;
1593 our_env[n_env++] = x;
1594 }
1595
1596 if (home) {
1597 x = strappend("HOME=", home);
1598 if (!x)
1599 return -ENOMEM;
1600 our_env[n_env++] = x;
1601 }
1602
1603 if (username) {
1604 x = strappend("LOGNAME=", username);
1605 if (!x)
1606 return -ENOMEM;
1607 our_env[n_env++] = x;
1608
1609 x = strappend("USER=", username);
1610 if (!x)
1611 return -ENOMEM;
1612 our_env[n_env++] = x;
1613 }
1614
1615 if (shell) {
1616 x = strappend("SHELL=", shell);
1617 if (!x)
1618 return -ENOMEM;
1619 our_env[n_env++] = x;
1620 }
1621
1622 if (!sd_id128_is_null(u->invocation_id)) {
1623 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1624 return -ENOMEM;
1625
1626 our_env[n_env++] = x;
1627 }
1628
1629 if (exec_context_needs_term(c)) {
1630 const char *tty_path, *term = NULL;
1631
1632 tty_path = exec_context_tty_path(c);
1633
1634 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1635 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1636 * passes to PID 1 ends up all the way in the console login shown. */
1637
1638 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1639 term = getenv("TERM");
1640 if (!term)
1641 term = default_term_for_tty(tty_path);
1642
1643 x = strappend("TERM=", term);
1644 if (!x)
1645 return -ENOMEM;
1646 our_env[n_env++] = x;
1647 }
1648
1649 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1650 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1651 return -ENOMEM;
1652
1653 our_env[n_env++] = x;
1654 }
1655
1656 our_env[n_env++] = NULL;
1657 assert(n_env <= 12);
1658
1659 *ret = our_env;
1660 our_env = NULL;
1661
1662 return 0;
1663 }
1664
1665 static int build_pass_environment(const ExecContext *c, char ***ret) {
1666 _cleanup_strv_free_ char **pass_env = NULL;
1667 size_t n_env = 0, n_bufsize = 0;
1668 char **i;
1669
1670 STRV_FOREACH(i, c->pass_environment) {
1671 _cleanup_free_ char *x = NULL;
1672 char *v;
1673
1674 v = getenv(*i);
1675 if (!v)
1676 continue;
1677 x = strjoin(*i, "=", v);
1678 if (!x)
1679 return -ENOMEM;
1680
1681 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1682 return -ENOMEM;
1683
1684 pass_env[n_env++] = x;
1685 pass_env[n_env] = NULL;
1686 x = NULL;
1687 }
1688
1689 *ret = pass_env;
1690 pass_env = NULL;
1691
1692 return 0;
1693 }
1694
1695 static bool exec_needs_mount_namespace(
1696 const ExecContext *context,
1697 const ExecParameters *params,
1698 ExecRuntime *runtime) {
1699
1700 assert(context);
1701 assert(params);
1702
1703 if (context->root_image)
1704 return true;
1705
1706 if (!strv_isempty(context->read_write_paths) ||
1707 !strv_isempty(context->read_only_paths) ||
1708 !strv_isempty(context->inaccessible_paths))
1709 return true;
1710
1711 if (context->n_bind_mounts > 0)
1712 return true;
1713
1714 if (context->mount_flags != 0)
1715 return true;
1716
1717 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1718 return true;
1719
1720 if (context->private_devices ||
1721 context->protect_system != PROTECT_SYSTEM_NO ||
1722 context->protect_home != PROTECT_HOME_NO ||
1723 context->protect_kernel_tunables ||
1724 context->protect_kernel_modules ||
1725 context->protect_control_groups)
1726 return true;
1727
1728 if (context->mount_apivfs && (context->root_image || context->root_directory))
1729 return true;
1730
1731 return false;
1732 }
1733
1734 static int setup_private_users(uid_t uid, gid_t gid) {
1735 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1736 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1737 _cleanup_close_ int unshare_ready_fd = -1;
1738 _cleanup_(sigkill_waitp) pid_t pid = 0;
1739 uint64_t c = 1;
1740 siginfo_t si;
1741 ssize_t n;
1742 int r;
1743
1744 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1745 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1746 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1747 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1748 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1749 * continues execution normally. */
1750
1751 if (uid != 0 && uid_is_valid(uid)) {
1752 r = asprintf(&uid_map,
1753 "0 0 1\n" /* Map root → root */
1754 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
1755 uid, uid);
1756 if (r < 0)
1757 return -ENOMEM;
1758 } else {
1759 uid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1760 if (!uid_map)
1761 return -ENOMEM;
1762 }
1763
1764 if (gid != 0 && gid_is_valid(gid)) {
1765 r = asprintf(&gid_map,
1766 "0 0 1\n" /* Map root → root */
1767 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
1768 gid, gid);
1769 if (r < 0)
1770 return -ENOMEM;
1771 } else {
1772 gid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1773 if (!gid_map)
1774 return -ENOMEM;
1775 }
1776
1777 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1778 * namespace. */
1779 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1780 if (unshare_ready_fd < 0)
1781 return -errno;
1782
1783 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1784 * failed. */
1785 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1786 return -errno;
1787
1788 pid = fork();
1789 if (pid < 0)
1790 return -errno;
1791
1792 if (pid == 0) {
1793 _cleanup_close_ int fd = -1;
1794 const char *a;
1795 pid_t ppid;
1796
1797 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1798 * here, after the parent opened its own user namespace. */
1799
1800 ppid = getppid();
1801 errno_pipe[0] = safe_close(errno_pipe[0]);
1802
1803 /* Wait until the parent unshared the user namespace */
1804 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1805 r = -errno;
1806 goto child_fail;
1807 }
1808
1809 /* Disable the setgroups() system call in the child user namespace, for good. */
1810 a = procfs_file_alloca(ppid, "setgroups");
1811 fd = open(a, O_WRONLY|O_CLOEXEC);
1812 if (fd < 0) {
1813 if (errno != ENOENT) {
1814 r = -errno;
1815 goto child_fail;
1816 }
1817
1818 /* If the file is missing the kernel is too old, let's continue anyway. */
1819 } else {
1820 if (write(fd, "deny\n", 5) < 0) {
1821 r = -errno;
1822 goto child_fail;
1823 }
1824
1825 fd = safe_close(fd);
1826 }
1827
1828 /* First write the GID map */
1829 a = procfs_file_alloca(ppid, "gid_map");
1830 fd = open(a, O_WRONLY|O_CLOEXEC);
1831 if (fd < 0) {
1832 r = -errno;
1833 goto child_fail;
1834 }
1835 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1836 r = -errno;
1837 goto child_fail;
1838 }
1839 fd = safe_close(fd);
1840
1841 /* The write the UID map */
1842 a = procfs_file_alloca(ppid, "uid_map");
1843 fd = open(a, O_WRONLY|O_CLOEXEC);
1844 if (fd < 0) {
1845 r = -errno;
1846 goto child_fail;
1847 }
1848 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1849 r = -errno;
1850 goto child_fail;
1851 }
1852
1853 _exit(EXIT_SUCCESS);
1854
1855 child_fail:
1856 (void) write(errno_pipe[1], &r, sizeof(r));
1857 _exit(EXIT_FAILURE);
1858 }
1859
1860 errno_pipe[1] = safe_close(errno_pipe[1]);
1861
1862 if (unshare(CLONE_NEWUSER) < 0)
1863 return -errno;
1864
1865 /* Let the child know that the namespace is ready now */
1866 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1867 return -errno;
1868
1869 /* Try to read an error code from the child */
1870 n = read(errno_pipe[0], &r, sizeof(r));
1871 if (n < 0)
1872 return -errno;
1873 if (n == sizeof(r)) { /* an error code was sent to us */
1874 if (r < 0)
1875 return r;
1876 return -EIO;
1877 }
1878 if (n != 0) /* on success we should have read 0 bytes */
1879 return -EIO;
1880
1881 r = wait_for_terminate(pid, &si);
1882 if (r < 0)
1883 return r;
1884 pid = 0;
1885
1886 /* If something strange happened with the child, let's consider this fatal, too */
1887 if (si.si_code != CLD_EXITED || si.si_status != 0)
1888 return -EIO;
1889
1890 return 0;
1891 }
1892
1893 static int setup_exec_directory(
1894 const ExecContext *context,
1895 const ExecParameters *params,
1896 uid_t uid,
1897 gid_t gid,
1898 ExecDirectoryType type,
1899 int *exit_status) {
1900
1901 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1902 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1903 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1904 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1905 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
1906 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
1907 };
1908 char **rt;
1909 int r;
1910
1911 assert(context);
1912 assert(params);
1913 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
1914 assert(exit_status);
1915
1916 if (!params->prefix[type])
1917 return 0;
1918
1919 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
1920 if (!uid_is_valid(uid))
1921 uid = 0;
1922 if (!gid_is_valid(gid))
1923 gid = 0;
1924 }
1925
1926 STRV_FOREACH(rt, context->directories[type].paths) {
1927 _cleanup_free_ char *p;
1928
1929 p = strjoin(params->prefix[type], "/", *rt);
1930 if (!p) {
1931 r = -ENOMEM;
1932 goto fail;
1933 }
1934
1935 r = mkdir_parents_label(p, 0755);
1936 if (r < 0)
1937 goto fail;
1938
1939 r = mkdir_label(p, context->directories[type].mode);
1940 if (r < 0 && r != -EEXIST)
1941 goto fail;
1942
1943 /* First lock down the access mode */
1944 if (chmod(p, context->directories[type].mode) < 0) {
1945 r = -errno;
1946 goto fail;
1947 }
1948
1949 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
1950 * a service, and shall not be writable. */
1951 if (type == EXEC_DIRECTORY_CONFIGURATION)
1952 continue;
1953
1954 /* Then, change the ownership of the whole tree, if necessary */
1955 r = path_chown_recursive(p, uid, gid);
1956 if (r < 0)
1957 goto fail;
1958 }
1959
1960 return 0;
1961
1962 fail:
1963 *exit_status = exit_status_table[type];
1964 return r;
1965 }
1966
1967 static int setup_smack(
1968 const ExecContext *context,
1969 const ExecCommand *command) {
1970
1971 int r;
1972
1973 assert(context);
1974 assert(command);
1975
1976 if (context->smack_process_label) {
1977 r = mac_smack_apply_pid(0, context->smack_process_label);
1978 if (r < 0)
1979 return r;
1980 }
1981 #ifdef SMACK_DEFAULT_PROCESS_LABEL
1982 else {
1983 _cleanup_free_ char *exec_label = NULL;
1984
1985 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
1986 if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
1987 return r;
1988
1989 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
1990 if (r < 0)
1991 return r;
1992 }
1993 #endif
1994
1995 return 0;
1996 }
1997
1998 static int compile_read_write_paths(
1999 const ExecContext *context,
2000 const ExecParameters *params,
2001 char ***ret) {
2002
2003 _cleanup_strv_free_ char **l = NULL;
2004 char **rt;
2005 ExecDirectoryType i;
2006
2007 /* Compile the list of writable paths. This is the combination of
2008 * the explicitly configured paths, plus all runtime directories. */
2009
2010 if (strv_isempty(context->read_write_paths)) {
2011 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
2012 if (!strv_isempty(context->directories[i].paths))
2013 break;
2014
2015 if (i == _EXEC_DIRECTORY_TYPE_MAX) {
2016 *ret = NULL; /* NOP if neither is set */
2017 return 0;
2018 }
2019 }
2020
2021 l = strv_copy(context->read_write_paths);
2022 if (!l)
2023 return -ENOMEM;
2024
2025 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++) {
2026 if (!params->prefix[i])
2027 continue;
2028
2029 STRV_FOREACH(rt, context->directories[i].paths) {
2030 char *s;
2031
2032 s = strjoin(params->prefix[i], "/", *rt);
2033 if (!s)
2034 return -ENOMEM;
2035
2036 if (strv_consume(&l, s) < 0)
2037 return -ENOMEM;
2038 }
2039 }
2040
2041 *ret = l;
2042 l = NULL;
2043
2044 return 0;
2045 }
2046
2047 static int apply_mount_namespace(
2048 Unit *u,
2049 ExecCommand *command,
2050 const ExecContext *context,
2051 const ExecParameters *params,
2052 ExecRuntime *runtime) {
2053
2054 _cleanup_strv_free_ char **rw = NULL;
2055 char *tmp = NULL, *var = NULL;
2056 const char *root_dir = NULL, *root_image = NULL;
2057 NameSpaceInfo ns_info = {
2058 .ignore_protect_paths = false,
2059 .private_dev = context->private_devices,
2060 .protect_control_groups = context->protect_control_groups,
2061 .protect_kernel_tunables = context->protect_kernel_tunables,
2062 .protect_kernel_modules = context->protect_kernel_modules,
2063 .mount_apivfs = context->mount_apivfs,
2064 };
2065 bool needs_sandboxing;
2066 int r;
2067
2068 assert(context);
2069
2070 /* The runtime struct only contains the parent of the private /tmp,
2071 * which is non-accessible to world users. Inside of it there's a /tmp
2072 * that is sticky, and that's the one we want to use here. */
2073
2074 if (context->private_tmp && runtime) {
2075 if (runtime->tmp_dir)
2076 tmp = strjoina(runtime->tmp_dir, "/tmp");
2077 if (runtime->var_tmp_dir)
2078 var = strjoina(runtime->var_tmp_dir, "/tmp");
2079 }
2080
2081 r = compile_read_write_paths(context, params, &rw);
2082 if (r < 0)
2083 return r;
2084
2085 if (params->flags & EXEC_APPLY_CHROOT) {
2086 root_image = context->root_image;
2087
2088 if (!root_image)
2089 root_dir = context->root_directory;
2090 }
2091
2092 /*
2093 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2094 * sandbox info, otherwise enforce it, don't ignore protected paths and
2095 * fail if we are enable to apply the sandbox inside the mount namespace.
2096 */
2097 if (!context->dynamic_user && root_dir)
2098 ns_info.ignore_protect_paths = true;
2099
2100 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2101
2102 r = setup_namespace(root_dir, root_image,
2103 &ns_info, rw,
2104 needs_sandboxing ? context->read_only_paths : NULL,
2105 needs_sandboxing ? context->inaccessible_paths : NULL,
2106 context->bind_mounts,
2107 context->n_bind_mounts,
2108 tmp,
2109 var,
2110 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2111 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2112 context->mount_flags,
2113 DISSECT_IMAGE_DISCARD_ON_LOOP);
2114
2115 /* If we couldn't set up the namespace this is probably due to a
2116 * missing capability. In this case, silently proceeed. */
2117 if (IN_SET(r, -EPERM, -EACCES)) {
2118 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2119 return 0;
2120 }
2121
2122 return r;
2123 }
2124
2125 static int apply_working_directory(
2126 const ExecContext *context,
2127 const ExecParameters *params,
2128 const char *home,
2129 const bool needs_mount_ns,
2130 int *exit_status) {
2131
2132 const char *d, *wd;
2133
2134 assert(context);
2135 assert(exit_status);
2136
2137 if (context->working_directory_home) {
2138
2139 if (!home) {
2140 *exit_status = EXIT_CHDIR;
2141 return -ENXIO;
2142 }
2143
2144 wd = home;
2145
2146 } else if (context->working_directory)
2147 wd = context->working_directory;
2148 else
2149 wd = "/";
2150
2151 if (params->flags & EXEC_APPLY_CHROOT) {
2152 if (!needs_mount_ns && context->root_directory)
2153 if (chroot(context->root_directory) < 0) {
2154 *exit_status = EXIT_CHROOT;
2155 return -errno;
2156 }
2157
2158 d = wd;
2159 } else
2160 d = prefix_roota(context->root_directory, wd);
2161
2162 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2163 *exit_status = EXIT_CHDIR;
2164 return -errno;
2165 }
2166
2167 return 0;
2168 }
2169
2170 static int setup_keyring(
2171 Unit *u,
2172 const ExecContext *context,
2173 const ExecParameters *p,
2174 uid_t uid, gid_t gid) {
2175
2176 key_serial_t keyring;
2177 int r;
2178
2179 assert(u);
2180 assert(context);
2181 assert(p);
2182
2183 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2184 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2185 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2186 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2187 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2188 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2189
2190 if (!(p->flags & EXEC_NEW_KEYRING))
2191 return 0;
2192
2193 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2194 return 0;
2195
2196 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2197 if (keyring == -1) {
2198 if (errno == ENOSYS)
2199 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2200 else if (IN_SET(errno, EACCES, EPERM))
2201 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2202 else if (errno == EDQUOT)
2203 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2204 else
2205 return log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2206
2207 return 0;
2208 }
2209
2210 /* Populate they keyring with the invocation ID by default. */
2211 if (!sd_id128_is_null(u->invocation_id)) {
2212 key_serial_t key;
2213
2214 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2215 if (key == -1)
2216 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2217 else {
2218 if (keyctl(KEYCTL_SETPERM, key,
2219 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2220 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2221 return log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2222 }
2223 }
2224
2225 /* And now, make the keyring owned by the service's user */
2226 if (uid_is_valid(uid) || gid_is_valid(gid))
2227 if (keyctl(KEYCTL_CHOWN, keyring, uid, gid, 0) < 0)
2228 return log_unit_error_errno(u, errno, "Failed to change ownership of session keyring: %m");
2229
2230 /* When requested link the user keyring into the session keyring. */
2231 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2232 uid_t saved_uid;
2233 gid_t saved_gid;
2234
2235 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
2236 * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
2237 * sucks for parallel execution. This mimics what pam_keyinit does, too.*/
2238
2239 saved_uid = getuid();
2240 saved_gid = getgid();
2241
2242 if (gid_is_valid(gid) && gid != saved_gid) {
2243 if (setregid(gid, -1) < 0)
2244 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2245 }
2246
2247 if (uid_is_valid(uid) && uid != saved_uid) {
2248 if (setreuid(uid, -1) < 0) {
2249 (void) setregid(saved_gid, -1);
2250 return log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2251 }
2252 }
2253
2254 if (keyctl(KEYCTL_LINK,
2255 KEY_SPEC_USER_KEYRING,
2256 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2257
2258 r = -errno;
2259
2260 (void) setreuid(saved_uid, -1);
2261 (void) setregid(saved_gid, -1);
2262
2263 return log_unit_error_errno(u, r, "Failed to link user keyring into session keyring: %m");
2264 }
2265
2266 if (uid_is_valid(uid) && uid != saved_uid) {
2267 if (setreuid(saved_uid, -1) < 0) {
2268 (void) setregid(saved_gid, -1);
2269 return log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2270 }
2271 }
2272
2273 if (gid_is_valid(gid) && gid != saved_gid) {
2274 if (setregid(saved_gid, -1) < 0)
2275 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2276 }
2277 }
2278
2279 return 0;
2280 }
2281
/* Appends those of the two file descriptors in 'pair' that are valid (i.e. >= 0) to 'array', advancing the
 * element counter *n accordingly. A NULL 'pair' is accepted and ignored. The caller must make sure the array
 * has room for up to two more entries. */
static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
        unsigned i;

        assert(array);
        assert(n);

        if (!pair)
                return;

        for (i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                (*n)++;
        }
}
2294
2295 static int close_remaining_fds(
2296 const ExecParameters *params,
2297 ExecRuntime *runtime,
2298 DynamicCreds *dcreds,
2299 int user_lookup_fd,
2300 int socket_fd,
2301 int *fds, unsigned n_fds) {
2302
2303 unsigned n_dont_close = 0;
2304 int dont_close[n_fds + 12];
2305
2306 assert(params);
2307
2308 if (params->stdin_fd >= 0)
2309 dont_close[n_dont_close++] = params->stdin_fd;
2310 if (params->stdout_fd >= 0)
2311 dont_close[n_dont_close++] = params->stdout_fd;
2312 if (params->stderr_fd >= 0)
2313 dont_close[n_dont_close++] = params->stderr_fd;
2314
2315 if (socket_fd >= 0)
2316 dont_close[n_dont_close++] = socket_fd;
2317 if (n_fds > 0) {
2318 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2319 n_dont_close += n_fds;
2320 }
2321
2322 if (runtime)
2323 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2324
2325 if (dcreds) {
2326 if (dcreds->user)
2327 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2328 if (dcreds->group)
2329 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2330 }
2331
2332 if (user_lookup_fd >= 0)
2333 dont_close[n_dont_close++] = user_lookup_fd;
2334
2335 return close_all_fds(dont_close, n_dont_close);
2336 }
2337
2338 static int send_user_lookup(
2339 Unit *unit,
2340 int user_lookup_fd,
2341 uid_t uid,
2342 gid_t gid) {
2343
2344 assert(unit);
2345
2346 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2347 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2348 * specified. */
2349
2350 if (user_lookup_fd < 0)
2351 return 0;
2352
2353 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2354 return 0;
2355
2356 if (writev(user_lookup_fd,
2357 (struct iovec[]) {
2358 IOVEC_INIT(&uid, sizeof(uid)),
2359 IOVEC_INIT(&gid, sizeof(gid)),
2360 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2361 return -errno;
2362
2363 return 0;
2364 }
2365
2366 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2367 int r;
2368
2369 assert(c);
2370 assert(home);
2371 assert(buf);
2372
2373 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2374
2375 if (*home)
2376 return 0;
2377
2378 if (!c->working_directory_home)
2379 return 0;
2380
2381 if (uid == 0) {
2382 /* Hardcode /root as home directory for UID 0 */
2383 *home = "/root";
2384 return 1;
2385 }
2386
2387 r = get_home_dir(buf);
2388 if (r < 0)
2389 return r;
2390
2391 *home = *buf;
2392 return 1;
2393 }
2394
2395 static int exec_child(
2396 Unit *unit,
2397 ExecCommand *command,
2398 const ExecContext *context,
2399 const ExecParameters *params,
2400 ExecRuntime *runtime,
2401 DynamicCreds *dcreds,
2402 char **argv,
2403 int socket_fd,
2404 int named_iofds[3],
2405 int *fds,
2406 unsigned n_storage_fds,
2407 unsigned n_socket_fds,
2408 char **files_env,
2409 int user_lookup_fd,
2410 int *exit_status) {
2411
2412 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2413 _cleanup_free_ char *mac_selinux_context_net = NULL, *home_buffer = NULL;
2414 _cleanup_free_ gid_t *supplementary_gids = NULL;
2415 const char *username = NULL, *groupname = NULL;
2416 const char *home = NULL, *shell = NULL;
2417 dev_t journal_stream_dev = 0;
2418 ino_t journal_stream_ino = 0;
2419 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2420 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
2421 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
2422 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
2423 #ifdef HAVE_SELINUX
2424 bool use_selinux = false;
2425 #endif
2426 #ifdef HAVE_SMACK
2427 bool use_smack = false;
2428 #endif
2429 #ifdef HAVE_APPARMOR
2430 bool use_apparmor = false;
2431 #endif
2432 uid_t uid = UID_INVALID;
2433 gid_t gid = GID_INVALID;
2434 int i, r, ngids = 0;
2435 unsigned n_fds;
2436 ExecDirectoryType dt;
2437 int secure_bits;
2438
2439 assert(unit);
2440 assert(command);
2441 assert(context);
2442 assert(params);
2443 assert(exit_status);
2444
2445 rename_process_from_path(command->path);
2446
2447 /* We reset exactly these signals, since they are the
2448 * only ones we set to SIG_IGN in the main daemon. All
2449 * others we leave untouched because we set them to
2450 * SIG_DFL or a valid handler initially, both of which
2451 * will be demoted to SIG_DFL. */
2452 (void) default_signals(SIGNALS_CRASH_HANDLER,
2453 SIGNALS_IGNORE, -1);
2454
2455 if (context->ignore_sigpipe)
2456 (void) ignore_signals(SIGPIPE, -1);
2457
2458 r = reset_signal_mask();
2459 if (r < 0) {
2460 *exit_status = EXIT_SIGNAL_MASK;
2461 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2462 }
2463
2464 if (params->idle_pipe)
2465 do_idle_pipe_dance(params->idle_pipe);
2466
2467 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2468 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2469 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2470 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2471
2472 log_forget_fds();
2473 log_set_open_when_needed(true);
2474
2475 /* In case anything used libc syslog(), close this here, too */
2476 closelog();
2477
2478 n_fds = n_storage_fds + n_socket_fds;
2479 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
2480 if (r < 0) {
2481 *exit_status = EXIT_FDS;
2482 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2483 }
2484
2485 if (!context->same_pgrp)
2486 if (setsid() < 0) {
2487 *exit_status = EXIT_SETSID;
2488 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2489 }
2490
2491 exec_context_tty_reset(context, params);
2492
2493 if (unit_shall_confirm_spawn(unit)) {
2494 const char *vc = params->confirm_spawn;
2495 _cleanup_free_ char *cmdline = NULL;
2496
2497 cmdline = exec_command_line(argv);
2498 if (!cmdline) {
2499 *exit_status = EXIT_MEMORY;
2500 return log_oom();
2501 }
2502
2503 r = ask_for_confirmation(vc, unit, cmdline);
2504 if (r != CONFIRM_EXECUTE) {
2505 if (r == CONFIRM_PRETEND_SUCCESS) {
2506 *exit_status = EXIT_SUCCESS;
2507 return 0;
2508 }
2509 *exit_status = EXIT_CONFIRM;
2510 log_unit_error(unit, "Execution cancelled by the user");
2511 return -ECANCELED;
2512 }
2513 }
2514
2515 if (context->dynamic_user && dcreds) {
2516
2517 /* Make sure we bypass our own NSS module for any NSS checks */
2518 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2519 *exit_status = EXIT_USER;
2520 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2521 }
2522
2523 r = dynamic_creds_realize(dcreds, &uid, &gid);
2524 if (r < 0) {
2525 *exit_status = EXIT_USER;
2526 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2527 }
2528
2529 if (!uid_is_valid(uid)) {
2530 *exit_status = EXIT_USER;
2531 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2532 return -ESRCH;
2533 }
2534
2535 if (!gid_is_valid(gid)) {
2536 *exit_status = EXIT_USER;
2537 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2538 return -ESRCH;
2539 }
2540
2541 if (dcreds->user)
2542 username = dcreds->user->name;
2543
2544 } else {
2545 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2546 if (r < 0) {
2547 *exit_status = EXIT_USER;
2548 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2549 }
2550
2551 r = get_fixed_group(context, &groupname, &gid);
2552 if (r < 0) {
2553 *exit_status = EXIT_GROUP;
2554 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2555 }
2556 }
2557
2558 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2559 r = get_supplementary_groups(context, username, groupname, gid,
2560 &supplementary_gids, &ngids);
2561 if (r < 0) {
2562 *exit_status = EXIT_GROUP;
2563 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2564 }
2565
2566 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2567 if (r < 0) {
2568 *exit_status = EXIT_USER;
2569 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2570 }
2571
2572 user_lookup_fd = safe_close(user_lookup_fd);
2573
2574 r = acquire_home(context, uid, &home, &home_buffer);
2575 if (r < 0) {
2576 *exit_status = EXIT_CHDIR;
2577 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2578 }
2579
2580 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2581 * must sure to drop O_NONBLOCK */
2582 if (socket_fd >= 0)
2583 (void) fd_nonblock(socket_fd, false);
2584
2585 r = setup_input(context, params, socket_fd, named_iofds);
2586 if (r < 0) {
2587 *exit_status = EXIT_STDIN;
2588 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2589 }
2590
2591 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2592 if (r < 0) {
2593 *exit_status = EXIT_STDOUT;
2594 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2595 }
2596
2597 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2598 if (r < 0) {
2599 *exit_status = EXIT_STDERR;
2600 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2601 }
2602
2603 if (params->cgroup_path) {
2604 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2605 if (r < 0) {
2606 *exit_status = EXIT_CGROUP;
2607 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2608 }
2609 }
2610
2611 if (context->oom_score_adjust_set) {
2612 char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
2613
2614 /* When we can't make this change due to EPERM, then
2615 * let's silently skip over it. User namespaces
2616 * prohibit write access to this file, and we
2617 * shouldn't trip up over that. */
2618
2619 sprintf(t, "%i", context->oom_score_adjust);
2620 r = write_string_file("/proc/self/oom_score_adj", t, 0);
2621 if (IN_SET(r, -EPERM, -EACCES))
2622 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2623 else if (r < 0) {
2624 *exit_status = EXIT_OOM_ADJUST;
2625 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
2626 }
2627 }
2628
2629 if (context->nice_set)
2630 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2631 *exit_status = EXIT_NICE;
2632 return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
2633 }
2634
2635 if (context->cpu_sched_set) {
2636 struct sched_param param = {
2637 .sched_priority = context->cpu_sched_priority,
2638 };
2639
2640 r = sched_setscheduler(0,
2641 context->cpu_sched_policy |
2642 (context->cpu_sched_reset_on_fork ?
2643 SCHED_RESET_ON_FORK : 0),
2644 &param);
2645 if (r < 0) {
2646 *exit_status = EXIT_SETSCHEDULER;
2647 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
2648 }
2649 }
2650
2651 if (context->cpuset)
2652 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2653 *exit_status = EXIT_CPUAFFINITY;
2654 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
2655 }
2656
2657 if (context->ioprio_set)
2658 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
2659 *exit_status = EXIT_IOPRIO;
2660 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
2661 }
2662
2663 if (context->timer_slack_nsec != NSEC_INFINITY)
2664 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
2665 *exit_status = EXIT_TIMERSLACK;
2666 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
2667 }
2668
2669 if (context->personality != PERSONALITY_INVALID) {
2670 r = safe_personality(context->personality);
2671 if (r < 0) {
2672 *exit_status = EXIT_PERSONALITY;
2673 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
2674 }
2675 }
2676
2677 if (context->utmp_id)
2678 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
2679 context->tty_path,
2680 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
2681 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
2682 USER_PROCESS,
2683 username);
2684
2685 if (context->user) {
2686 r = chown_terminal(STDIN_FILENO, uid);
2687 if (r < 0) {
2688 *exit_status = EXIT_STDIN;
2689 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
2690 }
2691 }
2692
2693 /* If delegation is enabled we'll pass ownership of the cgroup
2694 * (but only in systemd's own controller hierarchy!) to the
2695 * user of the new process. */
2696 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
2697 r = cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0644, uid, gid);
2698 if (r < 0) {
2699 *exit_status = EXIT_CGROUP;
2700 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
2701 }
2702
2703 r = cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0755, uid, gid);
2704 if (r < 0) {
2705 *exit_status = EXIT_CGROUP;
2706 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
2707 }
2708 }
2709
2710 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2711 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
2712 if (r < 0)
2713 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
2714 }
2715
2716 r = build_environment(
2717 unit,
2718 context,
2719 params,
2720 n_fds,
2721 home,
2722 username,
2723 shell,
2724 journal_stream_dev,
2725 journal_stream_ino,
2726 &our_env);
2727 if (r < 0) {
2728 *exit_status = EXIT_MEMORY;
2729 return log_oom();
2730 }
2731
2732 r = build_pass_environment(context, &pass_env);
2733 if (r < 0) {
2734 *exit_status = EXIT_MEMORY;
2735 return log_oom();
2736 }
2737
2738 accum_env = strv_env_merge(5,
2739 params->environment,
2740 our_env,
2741 pass_env,
2742 context->environment,
2743 files_env,
2744 NULL);
2745 if (!accum_env) {
2746 *exit_status = EXIT_MEMORY;
2747 return log_oom();
2748 }
2749 accum_env = strv_env_clean(accum_env);
2750
2751 (void) umask(context->umask);
2752
2753 r = setup_keyring(unit, context, params, uid, gid);
2754 if (r < 0) {
2755 *exit_status = EXIT_KEYRING;
2756 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
2757 }
2758
2759 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
2760 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2761
2762 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
2763 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
2764
2765 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
2766 if (needs_ambient_hack)
2767 needs_setuid = false;
2768 else
2769 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
2770
2771 if (needs_sandboxing) {
2772 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
2773 * present. The actual MAC context application will happen later, as late as possible, to avoid
2774 * impacting our own code paths. */
2775
2776 #ifdef HAVE_SELINUX
2777 use_selinux = mac_selinux_use();
2778 #endif
2779 #ifdef HAVE_SMACK
2780 use_smack = mac_smack_use();
2781 #endif
2782 #ifdef HAVE_APPARMOR
2783 use_apparmor = mac_apparmor_use();
2784 #endif
2785 }
2786
2787 if (needs_setuid) {
2788 if (context->pam_name && username) {
2789 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
2790 if (r < 0) {
2791 *exit_status = EXIT_PAM;
2792 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
2793 }
2794 }
2795 }
2796
2797 if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
2798 r = setup_netns(runtime->netns_storage_socket);
2799 if (r < 0) {
2800 *exit_status = EXIT_NETWORK;
2801 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
2802 }
2803 }
2804
2805 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
2806 if (needs_mount_namespace) {
2807 r = apply_mount_namespace(unit, command, context, params, runtime);
2808 if (r < 0) {
2809 *exit_status = EXIT_NAMESPACE;
2810 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
2811 }
2812 }
2813
2814 /* Apply just after mount namespace setup */
2815 r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
2816 if (r < 0)
2817 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
2818
2819 /* Drop groups as early as possbile */
2820 if (needs_setuid) {
2821 r = enforce_groups(context, gid, supplementary_gids, ngids);
2822 if (r < 0) {
2823 *exit_status = EXIT_GROUP;
2824 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
2825 }
2826 }
2827
2828 if (needs_sandboxing) {
2829 #ifdef HAVE_SELINUX
2830 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
2831 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
2832 if (r < 0) {
2833 *exit_status = EXIT_SELINUX_CONTEXT;
2834 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
2835 }
2836 }
2837 #endif
2838
2839 if (context->private_users) {
2840 r = setup_private_users(uid, gid);
2841 if (r < 0) {
2842 *exit_status = EXIT_USER;
2843 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
2844 }
2845 }
2846 }
2847
2848 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
2849 * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
2850 * was needed to upload the policy and can now be closed as well. */
2851 r = close_all_fds(fds, n_fds);
2852 if (r >= 0)
2853 r = shift_fds(fds, n_fds);
2854 if (r >= 0)
2855 r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
2856 if (r < 0) {
2857 *exit_status = EXIT_FDS;
2858 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
2859 }
2860
2861 secure_bits = context->secure_bits;
2862
2863 if (needs_sandboxing) {
2864 uint64_t bset;
2865
2866 for (i = 0; i < _RLIMIT_MAX; i++) {
2867
2868 if (!context->rlimit[i])
2869 continue;
2870
2871 r = setrlimit_closest(i, context->rlimit[i]);
2872 if (r < 0) {
2873 *exit_status = EXIT_LIMITS;
2874 return log_unit_error_errno(unit, r, "Failed to adjust resource limit %s: %m", rlimit_to_string(i));
2875 }
2876 }
2877
2878 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
2879 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
2880 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
2881 *exit_status = EXIT_LIMITS;
2882 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
2883 }
2884 }
2885
2886 bset = context->capability_bounding_set;
2887 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
2888 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
2889 * instead of us doing that */
2890 if (needs_ambient_hack)
2891 bset |= (UINT64_C(1) << CAP_SETPCAP) |
2892 (UINT64_C(1) << CAP_SETUID) |
2893 (UINT64_C(1) << CAP_SETGID);
2894
2895 if (!cap_test_all(bset)) {
2896 r = capability_bounding_set_drop(bset, false);
2897 if (r < 0) {
2898 *exit_status = EXIT_CAPABILITIES;
2899 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
2900 }
2901 }
2902
2903 /* This is done before enforce_user, but ambient set
2904 * does not survive over setresuid() if keep_caps is not set. */
2905 if (!needs_ambient_hack &&
2906 context->capability_ambient_set != 0) {
2907 r = capability_ambient_set_apply(context->capability_ambient_set, true);
2908 if (r < 0) {
2909 *exit_status = EXIT_CAPABILITIES;
2910 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
2911 }
2912 }
2913 }
2914
2915 if (needs_setuid) {
2916 if (context->user) {
2917 r = enforce_user(context, uid);
2918 if (r < 0) {
2919 *exit_status = EXIT_USER;
2920 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
2921 }
2922
2923 if (!needs_ambient_hack &&
2924 context->capability_ambient_set != 0) {
2925
2926 /* Fix the ambient capabilities after user change. */
2927 r = capability_ambient_set_apply(context->capability_ambient_set, false);
2928 if (r < 0) {
2929 *exit_status = EXIT_CAPABILITIES;
2930 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
2931 }
2932
2933 /* If we were asked to change user and ambient capabilities
2934 * were requested, we had to add keep-caps to the securebits
2935 * so that we would maintain the inherited capability set
2936 * through the setresuid(). Make sure that the bit is added
2937 * also to the context secure_bits so that we don't try to
2938 * drop the bit away next. */
2939
2940 secure_bits |= 1<<SECURE_KEEP_CAPS;
2941 }
2942 }
2943 }
2944
2945 if (needs_sandboxing) {
2946 /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
2947 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
2948 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
2949 * are restricted. */
2950
2951 #ifdef HAVE_SELINUX
2952 if (use_selinux) {
2953 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
2954
2955 if (exec_context) {
2956 r = setexeccon(exec_context);
2957 if (r < 0) {
2958 *exit_status = EXIT_SELINUX_CONTEXT;
2959 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
2960 }
2961 }
2962 }
2963 #endif
2964
2965 #ifdef HAVE_SMACK
2966 if (use_smack) {
2967 r = setup_smack(context, command);
2968 if (r < 0) {
2969 *exit_status = EXIT_SMACK_PROCESS_LABEL;
2970 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
2971 }
2972 }
2973 #endif
2974
2975 #ifdef HAVE_APPARMOR
2976 if (use_apparmor && context->apparmor_profile) {
2977 r = aa_change_onexec(context->apparmor_profile);
2978 if (r < 0 && !context->apparmor_profile_ignore) {
2979 *exit_status = EXIT_APPARMOR_PROFILE;
2980 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
2981 }
2982 }
2983 #endif
2984
2985 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
2986 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
2987 if (prctl(PR_GET_SECUREBITS) != secure_bits)
2988 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
2989 *exit_status = EXIT_SECUREBITS;
2990 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
2991 }
2992
2993 if (context_has_no_new_privileges(context))
2994 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
2995 *exit_status = EXIT_NO_NEW_PRIVILEGES;
2996 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
2997 }
2998
2999 #ifdef HAVE_SECCOMP
3000 r = apply_address_families(unit, context);
3001 if (r < 0) {
3002 *exit_status = EXIT_ADDRESS_FAMILIES;
3003 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3004 }
3005
3006 r = apply_memory_deny_write_execute(unit, context);
3007 if (r < 0) {
3008 *exit_status = EXIT_SECCOMP;
3009 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3010 }
3011
3012 r = apply_restrict_realtime(unit, context);
3013 if (r < 0) {
3014 *exit_status = EXIT_SECCOMP;
3015 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3016 }
3017
3018 r = apply_restrict_namespaces(unit, context);
3019 if (r < 0) {
3020 *exit_status = EXIT_SECCOMP;
3021 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3022 }
3023
3024 r = apply_protect_sysctl(unit, context);
3025 if (r < 0) {
3026 *exit_status = EXIT_SECCOMP;
3027 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3028 }
3029
3030 r = apply_protect_kernel_modules(unit, context);
3031 if (r < 0) {
3032 *exit_status = EXIT_SECCOMP;
3033 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3034 }
3035
3036 r = apply_private_devices(unit, context);
3037 if (r < 0) {
3038 *exit_status = EXIT_SECCOMP;
3039 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3040 }
3041
3042 r = apply_syscall_archs(unit, context);
3043 if (r < 0) {
3044 *exit_status = EXIT_SECCOMP;
3045 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3046 }
3047
3048 r = apply_lock_personality(unit, context);
3049 if (r < 0) {
3050 *exit_status = EXIT_SECCOMP;
3051 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3052 }
3053
3054 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3055 * by the filter as little as possible. */
3056 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3057 if (r < 0) {
3058 *exit_status = EXIT_SECCOMP;
3059 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3060 }
3061 #endif
3062 }
3063
3064 if (!strv_isempty(context->unset_environment)) {
3065 char **ee = NULL;
3066
3067 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3068 if (!ee) {
3069 *exit_status = EXIT_MEMORY;
3070 return log_oom();
3071 }
3072
3073 strv_free(accum_env);
3074 accum_env = ee;
3075 }
3076
3077 final_argv = replace_env_argv(argv, accum_env);
3078 if (!final_argv) {
3079 *exit_status = EXIT_MEMORY;
3080 return log_oom();
3081 }
3082
3083 if (_unlikely_(log_get_max_level() >= LOG_DEBUG)) {
3084 _cleanup_free_ char *line;
3085
3086 line = exec_command_line(final_argv);
3087 if (line) {
3088 log_struct(LOG_DEBUG,
3089 "EXECUTABLE=%s", command->path,
3090 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3091 LOG_UNIT_ID(unit),
3092 LOG_UNIT_INVOCATION_ID(unit),
3093 NULL);
3094 }
3095 }
3096
3097 execve(command->path, final_argv, accum_env);
3098
3099 if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3100
3101 log_struct_errno(LOG_INFO, errno,
3102 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3103 LOG_UNIT_ID(unit),
3104 LOG_UNIT_INVOCATION_ID(unit),
3105 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3106 command->path),
3107 "EXECUTABLE=%s", command->path,
3108 NULL);
3109
3110 return 0;
3111 }
3112
3113 *exit_status = EXIT_EXEC;
3114 return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
3115 }
3116
3117 int exec_spawn(Unit *unit,
3118 ExecCommand *command,
3119 const ExecContext *context,
3120 const ExecParameters *params,
3121 ExecRuntime *runtime,
3122 DynamicCreds *dcreds,
3123 pid_t *ret) {
3124
3125 _cleanup_strv_free_ char **files_env = NULL;
3126 int *fds = NULL;
3127 unsigned n_storage_fds = 0, n_socket_fds = 0;
3128 _cleanup_free_ char *line = NULL;
3129 int socket_fd, r;
3130 int named_iofds[3] = { -1, -1, -1 };
3131 char **argv;
3132 pid_t pid;
3133
3134 assert(unit);
3135 assert(command);
3136 assert(context);
3137 assert(ret);
3138 assert(params);
3139 assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
3140
3141 if (context->std_input == EXEC_INPUT_SOCKET ||
3142 context->std_output == EXEC_OUTPUT_SOCKET ||
3143 context->std_error == EXEC_OUTPUT_SOCKET) {
3144
3145 if (params->n_socket_fds > 1) {
3146 log_unit_error(unit, "Got more than one socket.");
3147 return -EINVAL;
3148 }
3149
3150 if (params->n_socket_fds == 0) {
3151 log_unit_error(unit, "Got no socket.");
3152 return -EINVAL;
3153 }
3154
3155 socket_fd = params->fds[0];
3156 } else {
3157 socket_fd = -1;
3158 fds = params->fds;
3159 n_storage_fds = params->n_storage_fds;
3160 n_socket_fds = params->n_socket_fds;
3161 }
3162
3163 r = exec_context_named_iofds(unit, context, params, named_iofds);
3164 if (r < 0)
3165 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3166
3167 r = exec_context_load_environment(unit, context, &files_env);
3168 if (r < 0)
3169 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3170
3171 argv = params->argv ?: command->argv;
3172 line = exec_command_line(argv);
3173 if (!line)
3174 return log_oom();
3175
3176 log_struct(LOG_DEBUG,
3177 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3178 "EXECUTABLE=%s", command->path,
3179 LOG_UNIT_ID(unit),
3180 LOG_UNIT_INVOCATION_ID(unit),
3181 NULL);
3182
3183 pid = fork();
3184 if (pid < 0)
3185 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3186
3187 if (pid == 0) {
3188 int exit_status = EXIT_SUCCESS;
3189
3190 r = exec_child(unit,
3191 command,
3192 context,
3193 params,
3194 runtime,
3195 dcreds,
3196 argv,
3197 socket_fd,
3198 named_iofds,
3199 fds,
3200 n_storage_fds,
3201 n_socket_fds,
3202 files_env,
3203 unit->manager->user_lookup_fds[1],
3204 &exit_status);
3205
3206 if (r < 0) {
3207 log_struct_errno(LOG_ERR, r,
3208 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3209 LOG_UNIT_ID(unit),
3210 LOG_UNIT_INVOCATION_ID(unit),
3211 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3212 exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3213 command->path),
3214 "EXECUTABLE=%s", command->path,
3215 NULL);
3216 }
3217
3218 _exit(exit_status);
3219 }
3220
3221 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3222
3223 /* We add the new process to the cgroup both in the child (so
3224 * that we can be sure that no user code is ever executed
3225 * outside of the cgroup) and in the parent (so that we can be
3226 * sure that when we kill the cgroup the process will be
3227 * killed too). */
3228 if (params->cgroup_path)
3229 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3230
3231 exec_status_start(&command->exec_status, pid);
3232
3233 *ret = pid;
3234 return 0;
3235 }
3236
3237 void exec_context_init(ExecContext *c) {
3238 ExecDirectoryType i;
3239
3240 assert(c);
3241
3242 c->umask = 0022;
3243 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3244 c->cpu_sched_policy = SCHED_OTHER;
3245 c->syslog_priority = LOG_DAEMON|LOG_INFO;
3246 c->syslog_level_prefix = true;
3247 c->ignore_sigpipe = true;
3248 c->timer_slack_nsec = NSEC_INFINITY;
3249 c->personality = PERSONALITY_INVALID;
3250 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3251 c->directories[i].mode = 0755;
3252 c->capability_bounding_set = CAP_ALL;
3253 c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
3254 }
3255
3256 void exec_context_done(ExecContext *c) {
3257 unsigned l;
3258 ExecDirectoryType i;
3259
3260 assert(c);
3261
3262 c->environment = strv_free(c->environment);
3263 c->environment_files = strv_free(c->environment_files);
3264 c->pass_environment = strv_free(c->pass_environment);
3265 c->unset_environment = strv_free(c->unset_environment);
3266
3267 for (l = 0; l < ELEMENTSOF(c->rlimit); l++)
3268 c->rlimit[l] = mfree(c->rlimit[l]);
3269
3270 for (l = 0; l < 3; l++)
3271 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3272
3273 c->working_directory = mfree(c->working_directory);
3274 c->root_directory = mfree(c->root_directory);
3275 c->root_image = mfree(c->root_image);
3276 c->tty_path = mfree(c->tty_path);
3277 c->syslog_identifier = mfree(c->syslog_identifier);
3278 c->user = mfree(c->user);
3279 c->group = mfree(c->group);
3280
3281 c->supplementary_groups = strv_free(c->supplementary_groups);
3282
3283 c->pam_name = mfree(c->pam_name);
3284
3285 c->read_only_paths = strv_free(c->read_only_paths);
3286 c->read_write_paths = strv_free(c->read_write_paths);
3287 c->inaccessible_paths = strv_free(c->inaccessible_paths);
3288
3289 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3290
3291 if (c->cpuset)
3292 CPU_FREE(c->cpuset);
3293
3294 c->utmp_id = mfree(c->utmp_id);
3295 c->selinux_context = mfree(c->selinux_context);
3296 c->apparmor_profile = mfree(c->apparmor_profile);
3297 c->smack_process_label = mfree(c->smack_process_label);
3298
3299 c->syscall_filter = set_free(c->syscall_filter);
3300 c->syscall_archs = set_free(c->syscall_archs);
3301 c->address_families = set_free(c->address_families);
3302
3303 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3304 c->directories[i].paths = strv_free(c->directories[i].paths);
3305 }
3306
3307 int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_prefix) {
3308 char **i;
3309
3310 assert(c);
3311
3312 if (!runtime_prefix)
3313 return 0;
3314
3315 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3316 _cleanup_free_ char *p;
3317
3318 p = strjoin(runtime_prefix, "/", *i);
3319 if (!p)
3320 return -ENOMEM;
3321
3322 /* We execute this synchronously, since we need to be
3323 * sure this is gone when we start the service
3324 * next. */
3325 (void) rm_rf(p, REMOVE_ROOT);
3326 }
3327
3328 return 0;
3329 }
3330
3331 void exec_command_done(ExecCommand *c) {
3332 assert(c);
3333
3334 c->path = mfree(c->path);
3335
3336 c->argv = strv_free(c->argv);
3337 }
3338
3339 void exec_command_done_array(ExecCommand *c, unsigned n) {
3340 unsigned i;
3341
3342 for (i = 0; i < n; i++)
3343 exec_command_done(c+i);
3344 }
3345
3346 ExecCommand* exec_command_free_list(ExecCommand *c) {
3347 ExecCommand *i;
3348
3349 while ((i = c)) {
3350 LIST_REMOVE(command, c, i);
3351 exec_command_done(i);
3352 free(i);
3353 }
3354
3355 return NULL;
3356 }
3357
3358 void exec_command_free_array(ExecCommand **c, unsigned n) {
3359 unsigned i;
3360
3361 for (i = 0; i < n; i++)
3362 c[i] = exec_command_free_list(c[i]);
3363 }
3364
3365 typedef struct InvalidEnvInfo {
3366 Unit *unit;
3367 const char *path;
3368 } InvalidEnvInfo;
3369
3370 static void invalid_env(const char *p, void *userdata) {
3371 InvalidEnvInfo *info = userdata;
3372
3373 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3374 }
3375
3376 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3377 assert(c);
3378
3379 switch (fd_index) {
3380 case STDIN_FILENO:
3381 if (c->std_input != EXEC_INPUT_NAMED_FD)
3382 return NULL;
3383 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3384 case STDOUT_FILENO:
3385 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3386 return NULL;
3387 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3388 case STDERR_FILENO:
3389 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3390 return NULL;
3391 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3392 default:
3393 return NULL;
3394 }
3395 }
3396
3397 int exec_context_named_iofds(Unit *unit, const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3398 unsigned i, targets;
3399 const char* stdio_fdname[3];
3400 unsigned n_fds;
3401
3402 assert(c);
3403 assert(p);
3404
3405 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3406 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3407 (c->std_error == EXEC_OUTPUT_NAMED_FD);
3408
3409 for (i = 0; i < 3; i++)
3410 stdio_fdname[i] = exec_context_fdname(c, i);
3411
3412 n_fds = p->n_storage_fds + p->n_socket_fds;
3413
3414 for (i = 0; i < n_fds && targets > 0; i++)
3415 if (named_iofds[STDIN_FILENO] < 0 &&
3416 c->std_input == EXEC_INPUT_NAMED_FD &&
3417 stdio_fdname[STDIN_FILENO] &&
3418 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3419
3420 named_iofds[STDIN_FILENO] = p->fds[i];
3421 targets--;
3422
3423 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3424 c->std_output == EXEC_OUTPUT_NAMED_FD &&
3425 stdio_fdname[STDOUT_FILENO] &&
3426 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3427
3428 named_iofds[STDOUT_FILENO] = p->fds[i];
3429 targets--;
3430
3431 } else if (named_iofds[STDERR_FILENO] < 0 &&
3432 c->std_error == EXEC_OUTPUT_NAMED_FD &&
3433 stdio_fdname[STDERR_FILENO] &&
3434 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3435
3436 named_iofds[STDERR_FILENO] = p->fds[i];
3437 targets--;
3438 }
3439
3440 return targets == 0 ? 0 : -ENOENT;
3441 }
3442
3443 int exec_context_load_environment(Unit *unit, const ExecContext *c, char ***l) {
3444 char **i, **r = NULL;
3445
3446 assert(c);
3447 assert(l);
3448
3449 STRV_FOREACH(i, c->environment_files) {
3450 char *fn;
3451 int k;
3452 unsigned n;
3453 bool ignore = false;
3454 char **p;
3455 _cleanup_globfree_ glob_t pglob = {};
3456
3457 fn = *i;
3458
3459 if (fn[0] == '-') {
3460 ignore = true;
3461 fn++;
3462 }
3463
3464 if (!path_is_absolute(fn)) {
3465 if (ignore)
3466 continue;
3467
3468 strv_free(r);
3469 return -EINVAL;
3470 }
3471
3472 /* Filename supports globbing, take all matching files */
3473 k = safe_glob(fn, 0, &pglob);
3474 if (k < 0) {
3475 if (ignore)
3476 continue;
3477
3478 strv_free(r);
3479 return k;
3480 }
3481
3482 /* When we don't match anything, -ENOENT should be returned */
3483 assert(pglob.gl_pathc > 0);
3484
3485 for (n = 0; n < pglob.gl_pathc; n++) {
3486 k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3487 if (k < 0) {
3488 if (ignore)
3489 continue;
3490
3491 strv_free(r);
3492 return k;
3493 }
3494 /* Log invalid environment variables with filename */
3495 if (p) {
3496 InvalidEnvInfo info = {
3497 .unit = unit,
3498 .path = pglob.gl_pathv[n]
3499 };
3500
3501 p = strv_env_clean_with_callback(p, invalid_env, &info);
3502 }
3503
3504 if (r == NULL)
3505 r = p;
3506 else {
3507 char **m;
3508
3509 m = strv_env_merge(2, r, p);
3510 strv_free(r);
3511 strv_free(p);
3512 if (!m)
3513 return -ENOMEM;
3514
3515 r = m;
3516 }
3517 }
3518 }
3519
3520 *l = r;
3521
3522 return 0;
3523 }
3524
3525 static bool tty_may_match_dev_console(const char *tty) {
3526 _cleanup_free_ char *active = NULL;
3527 char *console;
3528
3529 if (!tty)
3530 return true;
3531
3532 tty = skip_dev_prefix(tty);
3533
3534 /* trivial identity? */
3535 if (streq(tty, "console"))
3536 return true;
3537
3538 console = resolve_dev_console(&active);
3539 /* if we could not resolve, assume it may */
3540 if (!console)
3541 return true;
3542
3543 /* "tty0" means the active VC, so it may be the same sometimes */
3544 return streq(console, tty) || (streq(console, "tty0") && tty_is_vc(tty));
3545 }
3546
3547 bool exec_context_may_touch_console(ExecContext *ec) {
3548
3549 return (ec->tty_reset ||
3550 ec->tty_vhangup ||
3551 ec->tty_vt_disallocate ||
3552 is_terminal_input(ec->std_input) ||
3553 is_terminal_output(ec->std_output) ||
3554 is_terminal_output(ec->std_error)) &&
3555 tty_may_match_dev_console(exec_context_tty_path(ec));
3556 }
3557
3558 static void strv_fprintf(FILE *f, char **l) {
3559 char **g;
3560
3561 assert(f);
3562
3563 STRV_FOREACH(g, l)
3564 fprintf(f, " %s", *g);
3565 }
3566
3567 void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
3568 char **e, **d;
3569 unsigned i;
3570 ExecDirectoryType dt;
3571 int r;
3572
3573 assert(c);
3574 assert(f);
3575
3576 prefix = strempty(prefix);
3577
3578 fprintf(f,
3579 "%sUMask: %04o\n"
3580 "%sWorkingDirectory: %s\n"
3581 "%sRootDirectory: %s\n"
3582 "%sNonBlocking: %s\n"
3583 "%sPrivateTmp: %s\n"
3584 "%sPrivateDevices: %s\n"
3585 "%sProtectKernelTunables: %s\n"
3586 "%sProtectKernelModules: %s\n"
3587 "%sProtectControlGroups: %s\n"
3588 "%sPrivateNetwork: %s\n"
3589 "%sPrivateUsers: %s\n"
3590 "%sProtectHome: %s\n"
3591 "%sProtectSystem: %s\n"
3592 "%sMountAPIVFS: %s\n"
3593 "%sIgnoreSIGPIPE: %s\n"
3594 "%sMemoryDenyWriteExecute: %s\n"
3595 "%sRestrictRealtime: %s\n"
3596 "%sKeyringMode: %s\n",
3597 prefix, c->umask,
3598 prefix, c->working_directory ? c->working_directory : "/",
3599 prefix, c->root_directory ? c->root_directory : "/",
3600 prefix, yes_no(c->non_blocking),
3601 prefix, yes_no(c->private_tmp),
3602 prefix, yes_no(c->private_devices),
3603 prefix, yes_no(c->protect_kernel_tunables),
3604 prefix, yes_no(c->protect_kernel_modules),
3605 prefix, yes_no(c->protect_control_groups),
3606 prefix, yes_no(c->private_network),
3607 prefix, yes_no(c->private_users),
3608 prefix, protect_home_to_string(c->protect_home),
3609 prefix, protect_system_to_string(c->protect_system),
3610 prefix, yes_no(c->mount_apivfs),
3611 prefix, yes_no(c->ignore_sigpipe),
3612 prefix, yes_no(c->memory_deny_write_execute),
3613 prefix, yes_no(c->restrict_realtime),
3614 prefix, exec_keyring_mode_to_string(c->keyring_mode));
3615
3616 if (c->root_image)
3617 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
3618
3619 STRV_FOREACH(e, c->environment)
3620 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
3621
3622 STRV_FOREACH(e, c->environment_files)
3623 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
3624
3625 STRV_FOREACH(e, c->pass_environment)
3626 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
3627
3628 STRV_FOREACH(e, c->unset_environment)
3629 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
3630
3631 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
3632
3633 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3634 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
3635
3636 STRV_FOREACH(d, c->directories[dt].paths)
3637 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
3638 }
3639
3640 if (c->nice_set)
3641 fprintf(f,
3642 "%sNice: %i\n",
3643 prefix, c->nice);
3644
3645 if (c->oom_score_adjust_set)
3646 fprintf(f,
3647 "%sOOMScoreAdjust: %i\n",
3648 prefix, c->oom_score_adjust);
3649
3650 for (i = 0; i < RLIM_NLIMITS; i++)
3651 if (c->rlimit[i]) {
3652 fprintf(f, "%s%s: " RLIM_FMT "\n",
3653 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
3654 fprintf(f, "%s%sSoft: " RLIM_FMT "\n",
3655 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
3656 }
3657
3658 if (c->ioprio_set) {
3659 _cleanup_free_ char *class_str = NULL;
3660
3661 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
3662 if (r >= 0)
3663 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
3664
3665 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
3666 }
3667
3668 if (c->cpu_sched_set) {
3669 _cleanup_free_ char *policy_str = NULL;
3670
3671 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
3672 if (r >= 0)
3673 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
3674
3675 fprintf(f,
3676 "%sCPUSchedulingPriority: %i\n"
3677 "%sCPUSchedulingResetOnFork: %s\n",
3678 prefix, c->cpu_sched_priority,
3679 prefix, yes_no(c->cpu_sched_reset_on_fork));
3680 }
3681
3682 if (c->cpuset) {
3683 fprintf(f, "%sCPUAffinity:", prefix);
3684 for (i = 0; i < c->cpuset_ncpus; i++)
3685 if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
3686 fprintf(f, " %u", i);
3687 fputs("\n", f);
3688 }
3689
3690 if (c->timer_slack_nsec != NSEC_INFINITY)
3691 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
3692
3693 fprintf(f,
3694 "%sStandardInput: %s\n"
3695 "%sStandardOutput: %s\n"
3696 "%sStandardError: %s\n",
3697 prefix, exec_input_to_string(c->std_input),
3698 prefix, exec_output_to_string(c->std_output),
3699 prefix, exec_output_to_string(c->std_error));
3700
3701 if (c->tty_path)
3702 fprintf(f,
3703 "%sTTYPath: %s\n"
3704 "%sTTYReset: %s\n"
3705 "%sTTYVHangup: %s\n"
3706 "%sTTYVTDisallocate: %s\n",
3707 prefix, c->tty_path,
3708 prefix, yes_no(c->tty_reset),
3709 prefix, yes_no(c->tty_vhangup),
3710 prefix, yes_no(c->tty_vt_disallocate));
3711
3712 if (IN_SET(c->std_output,
3713 EXEC_OUTPUT_SYSLOG,
3714 EXEC_OUTPUT_KMSG,
3715 EXEC_OUTPUT_JOURNAL,
3716 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
3717 EXEC_OUTPUT_KMSG_AND_CONSOLE,
3718 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
3719 IN_SET(c->std_error,
3720 EXEC_OUTPUT_SYSLOG,
3721 EXEC_OUTPUT_KMSG,
3722 EXEC_OUTPUT_JOURNAL,
3723 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
3724 EXEC_OUTPUT_KMSG_AND_CONSOLE,
3725 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
3726
3727 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
3728
3729 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
3730 if (r >= 0)
3731 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
3732
3733 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
3734 if (r >= 0)
3735 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
3736 }
3737
3738 if (c->secure_bits) {
3739 _cleanup_free_ char *str = NULL;
3740
3741 r = secure_bits_to_string_alloc(c->secure_bits, &str);
3742 if (r >= 0)
3743 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
3744 }
3745
3746 if (c->capability_bounding_set != CAP_ALL) {
3747 _cleanup_free_ char *str = NULL;
3748
3749 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
3750 if (r >= 0)
3751 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
3752 }
3753
3754 if (c->capability_ambient_set != 0) {
3755 _cleanup_free_ char *str = NULL;
3756
3757 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
3758 if (r >= 0)
3759 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
3760 }
3761
3762 if (c->user)
3763 fprintf(f, "%sUser: %s\n", prefix, c->user);
3764 if (c->group)
3765 fprintf(f, "%sGroup: %s\n", prefix, c->group);
3766
3767 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
3768
3769 if (strv_length(c->supplementary_groups) > 0) {
3770 fprintf(f, "%sSupplementaryGroups:", prefix);
3771 strv_fprintf(f, c->supplementary_groups);
3772 fputs("\n", f);
3773 }
3774
3775 if (c->pam_name)
3776 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
3777
3778 if (strv_length(c->read_write_paths) > 0) {
3779 fprintf(f, "%sReadWritePaths:", prefix);
3780 strv_fprintf(f, c->read_write_paths);
3781 fputs("\n", f);
3782 }
3783
3784 if (strv_length(c->read_only_paths) > 0) {
3785 fprintf(f, "%sReadOnlyPaths:", prefix);
3786 strv_fprintf(f, c->read_only_paths);
3787 fputs("\n", f);
3788 }
3789
3790 if (strv_length(c->inaccessible_paths) > 0) {
3791 fprintf(f, "%sInaccessiblePaths:", prefix);
3792 strv_fprintf(f, c->inaccessible_paths);
3793 fputs("\n", f);
3794 }
3795
3796 if (c->n_bind_mounts > 0)
3797 for (i = 0; i < c->n_bind_mounts; i++) {
3798 fprintf(f, "%s%s: %s:%s:%s\n", prefix,
3799 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
3800 c->bind_mounts[i].source,
3801 c->bind_mounts[i].destination,
3802 c->bind_mounts[i].recursive ? "rbind" : "norbind");
3803 }
3804
3805 if (c->utmp_id)
3806 fprintf(f,
3807 "%sUtmpIdentifier: %s\n",
3808 prefix, c->utmp_id);
3809
3810 if (c->selinux_context)
3811 fprintf(f,
3812 "%sSELinuxContext: %s%s\n",
3813 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
3814
3815 if (c->apparmor_profile)
3816 fprintf(f,
3817 "%sAppArmorProfile: %s%s\n",
3818 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
3819
3820 if (c->smack_process_label)
3821 fprintf(f,
3822 "%sSmackProcessLabel: %s%s\n",
3823 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
3824
3825 if (c->personality != PERSONALITY_INVALID)
3826 fprintf(f,
3827 "%sPersonality: %s\n",
3828 prefix, strna(personality_to_string(c->personality)));
3829
3830 fprintf(f,
3831 "%sLockPersonality: %s\n",
3832 prefix, yes_no(c->lock_personality));
3833
3834 if (c->syscall_filter) {
3835 #ifdef HAVE_SECCOMP
3836 Iterator j;
3837 void *id;
3838 bool first = true;
3839 #endif
3840
3841 fprintf(f,
3842 "%sSystemCallFilter: ",
3843 prefix);
3844
3845 if (!c->syscall_whitelist)
3846 fputc('~', f);
3847
3848 #ifdef HAVE_SECCOMP
3849 SET_FOREACH(id, c->syscall_filter, j) {
3850 _cleanup_free_ char *name = NULL;
3851
3852 if (first)
3853 first = false;
3854 else
3855 fputc(' ', f);
3856
3857 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
3858 fputs(strna(name), f);
3859 }
3860 #endif
3861
3862 fputc('\n', f);
3863 }
3864
3865 if (c->syscall_archs) {
3866 #ifdef HAVE_SECCOMP
3867 Iterator j;
3868 void *id;
3869 #endif
3870
3871 fprintf(f,
3872 "%sSystemCallArchitectures:",
3873 prefix);
3874
3875 #ifdef HAVE_SECCOMP
3876 SET_FOREACH(id, c->syscall_archs, j)
3877 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
3878 #endif
3879 fputc('\n', f);
3880 }
3881
3882 if (exec_context_restrict_namespaces_set(c)) {
3883 _cleanup_free_ char *s = NULL;
3884
3885 r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
3886 if (r >= 0)
3887 fprintf(f, "%sRestrictNamespaces: %s\n",
3888 prefix, s);
3889 }
3890
3891 if (c->syscall_errno > 0)
3892 fprintf(f,
3893 "%sSystemCallErrorNumber: %s\n",
3894 prefix, strna(errno_to_name(c->syscall_errno)));
3895
3896 if (c->apparmor_profile)
3897 fprintf(f,
3898 "%sAppArmorProfile: %s%s\n",
3899 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
3900 }
3901
3902 bool exec_context_maintains_privileges(ExecContext *c) {
3903 assert(c);
3904
3905 /* Returns true if the process forked off would run under
3906 * an unchanged UID or as root. */
3907
3908 if (!c->user)
3909 return true;
3910
3911 if (streq(c->user, "root") || streq(c->user, "0"))
3912 return true;
3913
3914 return false;
3915 }
3916
3917 int exec_context_get_effective_ioprio(ExecContext *c) {
3918 int p;
3919
3920 assert(c);
3921
3922 if (c->ioprio_set)
3923 return c->ioprio;
3924
3925 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
3926 if (p < 0)
3927 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
3928
3929 return p;
3930 }
3931
3932 void exec_status_start(ExecStatus *s, pid_t pid) {
3933 assert(s);
3934
3935 zero(*s);
3936 s->pid = pid;
3937 dual_timestamp_get(&s->start_timestamp);
3938 }
3939
3940 void exec_status_exit(ExecStatus *s, ExecContext *context, pid_t pid, int code, int status) {
3941 assert(s);
3942
3943 if (s->pid && s->pid != pid)
3944 zero(*s);
3945
3946 s->pid = pid;
3947 dual_timestamp_get(&s->exit_timestamp);
3948
3949 s->code = code;
3950 s->status = status;
3951
3952 if (context) {
3953 if (context->utmp_id)
3954 utmp_put_dead_process(context->utmp_id, pid, code, status);
3955
3956 exec_context_tty_reset(context, NULL);
3957 }
3958 }
3959
3960 void exec_status_dump(ExecStatus *s, FILE *f, const char *prefix) {
3961 char buf[FORMAT_TIMESTAMP_MAX];
3962
3963 assert(s);
3964 assert(f);
3965
3966 if (s->pid <= 0)
3967 return;
3968
3969 prefix = strempty(prefix);
3970
3971 fprintf(f,
3972 "%sPID: "PID_FMT"\n",
3973 prefix, s->pid);
3974
3975 if (dual_timestamp_is_set(&s->start_timestamp))
3976 fprintf(f,
3977 "%sStart Timestamp: %s\n",
3978 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
3979
3980 if (dual_timestamp_is_set(&s->exit_timestamp))
3981 fprintf(f,
3982 "%sExit Timestamp: %s\n"
3983 "%sExit Code: %s\n"
3984 "%sExit Status: %i\n",
3985 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
3986 prefix, sigchld_code_to_string(s->code),
3987 prefix, s->status);
3988 }
3989
3990 char *exec_command_line(char **argv) {
3991 size_t k;
3992 char *n, *p, **a;
3993 bool first = true;
3994
3995 assert(argv);
3996
3997 k = 1;
3998 STRV_FOREACH(a, argv)
3999 k += strlen(*a)+3;
4000
4001 n = new(char, k);
4002 if (!n)
4003 return NULL;
4004
4005 p = n;
4006 STRV_FOREACH(a, argv) {
4007
4008 if (!first)
4009 *(p++) = ' ';
4010 else
4011 first = false;
4012
4013 if (strpbrk(*a, WHITESPACE)) {
4014 *(p++) = '\'';
4015 p = stpcpy(p, *a);
4016 *(p++) = '\'';
4017 } else
4018 p = stpcpy(p, *a);
4019
4020 }
4021
4022 *p = 0;
4023
4024 /* FIXME: this doesn't really handle arguments that have
4025 * spaces and ticks in them */
4026
4027 return n;
4028 }
4029
4030 void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4031 _cleanup_free_ char *cmd = NULL;
4032 const char *prefix2;
4033
4034 assert(c);
4035 assert(f);
4036
4037 prefix = strempty(prefix);
4038 prefix2 = strjoina(prefix, "\t");
4039
4040 cmd = exec_command_line(c->argv);
4041 fprintf(f,
4042 "%sCommand Line: %s\n",
4043 prefix, cmd ? cmd : strerror(ENOMEM));
4044
4045 exec_status_dump(&c->exec_status, f, prefix2);
4046 }
4047
4048 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4049 assert(f);
4050
4051 prefix = strempty(prefix);
4052
4053 LIST_FOREACH(command, c, c)
4054 exec_command_dump(c, f, prefix);
4055 }
4056
4057 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4058 ExecCommand *end;
4059
4060 assert(l);
4061 assert(e);
4062
4063 if (*l) {
4064 /* It's kind of important, that we keep the order here */
4065 LIST_FIND_TAIL(command, *l, end);
4066 LIST_INSERT_AFTER(command, *l, end, e);
4067 } else
4068 *l = e;
4069 }
4070
4071 int exec_command_set(ExecCommand *c, const char *path, ...) {
4072 va_list ap;
4073 char **l, *p;
4074
4075 assert(c);
4076 assert(path);
4077
4078 va_start(ap, path);
4079 l = strv_new_ap(path, ap);
4080 va_end(ap);
4081
4082 if (!l)
4083 return -ENOMEM;
4084
4085 p = strdup(path);
4086 if (!p) {
4087 strv_free(l);
4088 return -ENOMEM;
4089 }
4090
4091 free(c->path);
4092 c->path = p;
4093
4094 strv_free(c->argv);
4095 c->argv = l;
4096
4097 return 0;
4098 }
4099
4100 int exec_command_append(ExecCommand *c, const char *path, ...) {
4101 _cleanup_strv_free_ char **l = NULL;
4102 va_list ap;
4103 int r;
4104
4105 assert(c);
4106 assert(path);
4107
4108 va_start(ap, path);
4109 l = strv_new_ap(path, ap);
4110 va_end(ap);
4111
4112 if (!l)
4113 return -ENOMEM;
4114
4115 r = strv_extend_strv(&c->argv, l, false);
4116 if (r < 0)
4117 return r;
4118
4119 return 0;
4120 }
4121
4122
4123 static int exec_runtime_allocate(ExecRuntime **rt) {
4124
4125 if (*rt)
4126 return 0;
4127
4128 *rt = new0(ExecRuntime, 1);
4129 if (!*rt)
4130 return -ENOMEM;
4131
4132 (*rt)->n_ref = 1;
4133 (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4134
4135 return 0;
4136 }
4137
4138 int exec_runtime_make(ExecRuntime **rt, ExecContext *c, const char *id) {
4139 int r;
4140
4141 assert(rt);
4142 assert(c);
4143 assert(id);
4144
4145 if (*rt)
4146 return 1;
4147
4148 if (!c->private_network && !c->private_tmp)
4149 return 0;
4150
4151 r = exec_runtime_allocate(rt);
4152 if (r < 0)
4153 return r;
4154
4155 if (c->private_network && (*rt)->netns_storage_socket[0] < 0) {
4156 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, (*rt)->netns_storage_socket) < 0)
4157 return -errno;
4158 }
4159
4160 if (c->private_tmp && !(*rt)->tmp_dir) {
4161 r = setup_tmp_dirs(id, &(*rt)->tmp_dir, &(*rt)->var_tmp_dir);
4162 if (r < 0)
4163 return r;
4164 }
4165
4166 return 1;
4167 }
4168
4169 ExecRuntime *exec_runtime_ref(ExecRuntime *r) {
4170 assert(r);
4171 assert(r->n_ref > 0);
4172
4173 r->n_ref++;
4174 return r;
4175 }
4176
4177 ExecRuntime *exec_runtime_unref(ExecRuntime *r) {
4178
4179 if (!r)
4180 return NULL;
4181
4182 assert(r->n_ref > 0);
4183
4184 r->n_ref--;
4185 if (r->n_ref > 0)
4186 return NULL;
4187
4188 free(r->tmp_dir);
4189 free(r->var_tmp_dir);
4190 safe_close_pair(r->netns_storage_socket);
4191 return mfree(r);
4192 }
4193
4194 int exec_runtime_serialize(Unit *u, ExecRuntime *rt, FILE *f, FDSet *fds) {
4195 assert(u);
4196 assert(f);
4197 assert(fds);
4198
4199 if (!rt)
4200 return 0;
4201
4202 if (rt->tmp_dir)
4203 unit_serialize_item(u, f, "tmp-dir", rt->tmp_dir);
4204
4205 if (rt->var_tmp_dir)
4206 unit_serialize_item(u, f, "var-tmp-dir", rt->var_tmp_dir);
4207
4208 if (rt->netns_storage_socket[0] >= 0) {
4209 int copy;
4210
4211 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4212 if (copy < 0)
4213 return copy;
4214
4215 unit_serialize_item_format(u, f, "netns-socket-0", "%i", copy);
4216 }
4217
4218 if (rt->netns_storage_socket[1] >= 0) {
4219 int copy;
4220
4221 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4222 if (copy < 0)
4223 return copy;
4224
4225 unit_serialize_item_format(u, f, "netns-socket-1", "%i", copy);
4226 }
4227
4228 return 0;
4229 }
4230
4231 int exec_runtime_deserialize_item(Unit *u, ExecRuntime **rt, const char *key, const char *value, FDSet *fds) {
4232 int r;
4233
4234 assert(rt);
4235 assert(key);
4236 assert(value);
4237
4238 if (streq(key, "tmp-dir")) {
4239 char *copy;
4240
4241 r = exec_runtime_allocate(rt);
4242 if (r < 0)
4243 return log_oom();
4244
4245 copy = strdup(value);
4246 if (!copy)
4247 return log_oom();
4248
4249 free((*rt)->tmp_dir);
4250 (*rt)->tmp_dir = copy;
4251
4252 } else if (streq(key, "var-tmp-dir")) {
4253 char *copy;
4254
4255 r = exec_runtime_allocate(rt);
4256 if (r < 0)
4257 return log_oom();
4258
4259 copy = strdup(value);
4260 if (!copy)
4261 return log_oom();
4262
4263 free((*rt)->var_tmp_dir);
4264 (*rt)->var_tmp_dir = copy;
4265
4266 } else if (streq(key, "netns-socket-0")) {
4267 int fd;
4268
4269 r = exec_runtime_allocate(rt);
4270 if (r < 0)
4271 return log_oom();
4272
4273 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4274 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4275 else {
4276 safe_close((*rt)->netns_storage_socket[0]);
4277 (*rt)->netns_storage_socket[0] = fdset_remove(fds, fd);
4278 }
4279 } else if (streq(key, "netns-socket-1")) {
4280 int fd;
4281
4282 r = exec_runtime_allocate(rt);
4283 if (r < 0)
4284 return log_oom();
4285
4286 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4287 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4288 else {
4289 safe_close((*rt)->netns_storage_socket[1]);
4290 (*rt)->netns_storage_socket[1] = fdset_remove(fds, fd);
4291 }
4292 } else
4293 return 0;
4294
4295 return 1;
4296 }
4297
4298 static void *remove_tmpdir_thread(void *p) {
4299 _cleanup_free_ char *path = p;
4300
4301 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4302 return NULL;
4303 }
4304
4305 void exec_runtime_destroy(ExecRuntime *rt) {
4306 int r;
4307
4308 if (!rt)
4309 return;
4310
4311 /* If there are multiple users of this, let's leave the stuff around */
4312 if (rt->n_ref > 1)
4313 return;
4314
4315 if (rt->tmp_dir) {
4316 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4317
4318 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4319 if (r < 0) {
4320 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4321 free(rt->tmp_dir);
4322 }
4323
4324 rt->tmp_dir = NULL;
4325 }
4326
4327 if (rt->var_tmp_dir) {
4328 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4329
4330 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4331 if (r < 0) {
4332 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4333 free(rt->var_tmp_dir);
4334 }
4335
4336 rt->var_tmp_dir = NULL;
4337 }
4338
4339 safe_close_pair(rt->netns_storage_socket);
4340 }
4341
4342 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
4343 [EXEC_INPUT_NULL] = "null",
4344 [EXEC_INPUT_TTY] = "tty",
4345 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4346 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
4347 [EXEC_INPUT_SOCKET] = "socket",
4348 [EXEC_INPUT_NAMED_FD] = "fd",
4349 };
4350
4351 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
4352
4353 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
4354 [EXEC_OUTPUT_INHERIT] = "inherit",
4355 [EXEC_OUTPUT_NULL] = "null",
4356 [EXEC_OUTPUT_TTY] = "tty",
4357 [EXEC_OUTPUT_SYSLOG] = "syslog",
4358 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
4359 [EXEC_OUTPUT_KMSG] = "kmsg",
4360 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
4361 [EXEC_OUTPUT_JOURNAL] = "journal",
4362 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
4363 [EXEC_OUTPUT_SOCKET] = "socket",
4364 [EXEC_OUTPUT_NAMED_FD] = "fd",
4365 };
4366
4367 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
4368
4369 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
4370 [EXEC_UTMP_INIT] = "init",
4371 [EXEC_UTMP_LOGIN] = "login",
4372 [EXEC_UTMP_USER] = "user",
4373 };
4374
4375 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
4376
4377 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
4378 [EXEC_PRESERVE_NO] = "no",
4379 [EXEC_PRESERVE_YES] = "yes",
4380 [EXEC_PRESERVE_RESTART] = "restart",
4381 };
4382
4383 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
4384
4385 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
4386 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
4387 [EXEC_DIRECTORY_STATE] = "StateDirectory",
4388 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
4389 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
4390 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
4391 };
4392
4393 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
4394
4395 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
4396 [EXEC_KEYRING_INHERIT] = "inherit",
4397 [EXEC_KEYRING_PRIVATE] = "private",
4398 [EXEC_KEYRING_SHARED] = "shared",
4399 };
4400
4401 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);