]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
Merge pull request #7388 from keszybz/doc-tweak
[thirdparty/systemd.git] / src / core / execute.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <errno.h>
21 #include <fcntl.h>
22 #include <glob.h>
23 #include <grp.h>
24 #include <poll.h>
25 #include <signal.h>
26 #include <string.h>
27 #include <sys/capability.h>
28 #include <sys/eventfd.h>
29 #include <sys/mman.h>
30 #include <sys/personality.h>
31 #include <sys/prctl.h>
32 #include <sys/shm.h>
33 #include <sys/socket.h>
34 #include <sys/stat.h>
35 #include <sys/types.h>
36 #include <sys/un.h>
37 #include <unistd.h>
38 #include <utmpx.h>
39
40 #if HAVE_PAM
41 #include <security/pam_appl.h>
42 #endif
43
44 #if HAVE_SELINUX
45 #include <selinux/selinux.h>
46 #endif
47
48 #if HAVE_SECCOMP
49 #include <seccomp.h>
50 #endif
51
52 #if HAVE_APPARMOR
53 #include <sys/apparmor.h>
54 #endif
55
56 #include "sd-messages.h"
57
58 #include "af-list.h"
59 #include "alloc-util.h"
60 #if HAVE_APPARMOR
61 #include "apparmor-util.h"
62 #endif
63 #include "async.h"
64 #include "barrier.h"
65 #include "cap-list.h"
66 #include "capability-util.h"
67 #include "chown-recursive.h"
68 #include "def.h"
69 #include "env-util.h"
70 #include "errno-list.h"
71 #include "execute.h"
72 #include "exit-status.h"
73 #include "fd-util.h"
74 #include "fileio.h"
75 #include "format-util.h"
76 #include "fs-util.h"
77 #include "glob-util.h"
78 #include "io-util.h"
79 #include "ioprio.h"
80 #include "label.h"
81 #include "log.h"
82 #include "macro.h"
83 #include "missing.h"
84 #include "mkdir.h"
85 #include "namespace.h"
86 #include "parse-util.h"
87 #include "path-util.h"
88 #include "process-util.h"
89 #include "rlimit-util.h"
90 #include "rm-rf.h"
91 #if HAVE_SECCOMP
92 #include "seccomp-util.h"
93 #endif
94 #include "securebits.h"
95 #include "securebits-util.h"
96 #include "selinux-util.h"
97 #include "signal-util.h"
98 #include "smack-util.h"
99 #include "special.h"
100 #include "string-table.h"
101 #include "string-util.h"
102 #include "strv.h"
103 #include "syslog-util.h"
104 #include "terminal-util.h"
105 #include "unit.h"
106 #include "user-util.h"
107 #include "util.h"
108 #include "utmp-wtmp.h"
109
110 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
111 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
112
113 /* This assumes there is a 'tty' group */
114 #define TTY_MODE 0620
115
116 #define SNDBUF_SIZE (8*1024*1024)
117
/* Relocates the file descriptors listed in fds[] so that entry i ends up as fd number i+3, i.e. the fds
 * occupy the consecutive range 3, 4, 5, …. The array is modified in place: each entry is replaced by the
 * number the fd was moved to.
 *
 * Returns 0 on success (also when n_fds is zero), or a negative errno-style value if fcntl(F_DUPFD) fails. */
static int shift_fds(int fds[], unsigned n_fds) {
        int first = 0, retry_at;

        if (n_fds <= 0)
                return 0;

        assert(fds);

        do {
                int idx;

                retry_at = -1;

                for (idx = first; idx < (int) n_fds; idx++) {
                        const int wanted = idx + 3;
                        int dup_fd;

                        /* Nothing to do if this one already sits in its slot */
                        if (fds[idx] == wanted)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* We asked for "wanted" but got something else, so the slot is still taken. Remember
                         * the first index where that happened and run another pass from there. */
                        if (dup_fd != wanted && retry_at < 0)
                                retry_at = idx;
                }

                first = retry_at;
        } while (retry_at >= 0);

        return 0;
}
162
/* Adjusts the file descriptor flags of all fds that are about to be passed to a child.
 *
 * FD_CLOEXEC is dropped from every fd (we want the child to inherit them). O_NONBLOCK is set or cleared
 * according to "nonblock", but only for the first n_socket_fds entries of the array, as O_NONBLOCK only
 * applies to socket activation.
 *
 * Returns 0 on success or the negative error returned by fd_nonblock()/fd_cloexec(). */
static int flags_fds(const int fds[], unsigned n_storage_fds, unsigned n_socket_fds, bool nonblock) {
        unsigned total, idx;

        total = n_storage_fds + n_socket_fds;
        if (total == 0)
                return 0;

        assert(fds);

        for (idx = 0; idx < total; idx++) {
                bool is_socket = idx < n_socket_fds;
                int r;

                if (is_socket) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* Unconditionally make the fd inheritable */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
195
196 static const char *exec_context_tty_path(const ExecContext *context) {
197 assert(context);
198
199 if (context->stdio_as_fds)
200 return NULL;
201
202 if (context->tty_path)
203 return context->tty_path;
204
205 return "/dev/console";
206 }
207
208 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
209 const char *path;
210
211 assert(context);
212
213 path = exec_context_tty_path(context);
214
215 if (context->tty_vhangup) {
216 if (p && p->stdin_fd >= 0)
217 (void) terminal_vhangup_fd(p->stdin_fd);
218 else if (path)
219 (void) terminal_vhangup(path);
220 }
221
222 if (context->tty_reset) {
223 if (p && p->stdin_fd >= 0)
224 (void) reset_terminal_fd(p->stdin_fd, true);
225 else if (path)
226 (void) reset_terminal(path);
227 }
228
229 if (context->tty_vt_disallocate && path)
230 (void) vt_disallocate(path);
231 }
232
233 static bool is_terminal_input(ExecInput i) {
234 return IN_SET(i,
235 EXEC_INPUT_TTY,
236 EXEC_INPUT_TTY_FORCE,
237 EXEC_INPUT_TTY_FAIL);
238 }
239
240 static bool is_terminal_output(ExecOutput o) {
241 return IN_SET(o,
242 EXEC_OUTPUT_TTY,
243 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
244 EXEC_OUTPUT_KMSG_AND_CONSOLE,
245 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
246 }
247
248 static bool is_syslog_output(ExecOutput o) {
249 return IN_SET(o,
250 EXEC_OUTPUT_SYSLOG,
251 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
252 }
253
254 static bool is_kmsg_output(ExecOutput o) {
255 return IN_SET(o,
256 EXEC_OUTPUT_KMSG,
257 EXEC_OUTPUT_KMSG_AND_CONSOLE);
258 }
259
260 static bool exec_context_needs_term(const ExecContext *c) {
261 assert(c);
262
263 /* Return true if the execution context suggests we should set $TERM to something useful. */
264
265 if (is_terminal_input(c->std_input))
266 return true;
267
268 if (is_terminal_output(c->std_output))
269 return true;
270
271 if (is_terminal_output(c->std_error))
272 return true;
273
274 return !!c->tty_path;
275 }
276
/* Opens /dev/null with the specified access flags (plus O_NOCTTY) and makes it available as fd number nfd.
 *
 * Returns nfd on success, a negative errno-style value on failure. */
static int open_null_as(int flags, int nfd) {
        int null_fd, ret;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        /* Lucky us, open() already handed out the fd number we wanted */
        if (null_fd == nfd)
                return nfd;

        if (dup2(null_fd, nfd) < 0)
                ret = -errno;
        else
                ret = nfd;

        safe_close(null_fd);
        return ret;
}
294
295 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
296 static const union sockaddr_union sa = {
297 .un.sun_family = AF_UNIX,
298 .un.sun_path = "/run/systemd/journal/stdout",
299 };
300 uid_t olduid = UID_INVALID;
301 gid_t oldgid = GID_INVALID;
302 int r;
303
304 if (gid_is_valid(gid)) {
305 oldgid = getgid();
306
307 if (setegid(gid) < 0)
308 return -errno;
309 }
310
311 if (uid_is_valid(uid)) {
312 olduid = getuid();
313
314 if (seteuid(uid) < 0) {
315 r = -errno;
316 goto restore_gid;
317 }
318 }
319
320 r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
321
322 /* If we fail to restore the uid or gid, things will likely
323 fail later on. This should only happen if an LSM interferes. */
324
325 if (uid_is_valid(uid))
326 (void) seteuid(olduid);
327
328 restore_gid:
329 if (gid_is_valid(gid))
330 (void) setegid(oldgid);
331
332 return r;
333 }
334
335 static int connect_logger_as(
336 Unit *unit,
337 const ExecContext *context,
338 const ExecParameters *params,
339 ExecOutput output,
340 const char *ident,
341 int nfd,
342 uid_t uid,
343 gid_t gid) {
344
345 int fd, r;
346
347 assert(context);
348 assert(params);
349 assert(output < _EXEC_OUTPUT_MAX);
350 assert(ident);
351 assert(nfd >= 0);
352
353 fd = socket(AF_UNIX, SOCK_STREAM, 0);
354 if (fd < 0)
355 return -errno;
356
357 r = connect_journal_socket(fd, uid, gid);
358 if (r < 0)
359 return r;
360
361 if (shutdown(fd, SHUT_RD) < 0) {
362 safe_close(fd);
363 return -errno;
364 }
365
366 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
367
368 dprintf(fd,
369 "%s\n"
370 "%s\n"
371 "%i\n"
372 "%i\n"
373 "%i\n"
374 "%i\n"
375 "%i\n",
376 context->syslog_identifier ?: ident,
377 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
378 context->syslog_priority,
379 !!context->syslog_level_prefix,
380 is_syslog_output(output),
381 is_kmsg_output(output),
382 is_terminal_output(output));
383
384 if (fd == nfd)
385 return nfd;
386
387 r = dup2(fd, nfd) < 0 ? -errno : nfd;
388 safe_close(fd);
389
390 return r;
391 }
/* Opens the terminal at the specified path via open_terminal() (adding O_NOCTTY to the passed mode) and
 * makes it available as fd number nfd.
 *
 * Returns nfd on success, a negative errno-style value on failure. */
static int open_terminal_as(const char *path, mode_t mode, int nfd) {
        int tty_fd, ret;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, mode | O_NOCTTY);
        if (tty_fd < 0)
                return tty_fd;

        /* Already at the right position? */
        if (tty_fd == nfd)
                return nfd;

        if (dup2(tty_fd, nfd) < 0)
                ret = -errno;
        else
                ret = nfd;

        safe_close(tty_fd);
        return ret;
}
410
411 static int fixup_input(ExecInput std_input, int socket_fd, bool apply_tty_stdin) {
412
413 if (is_terminal_input(std_input) && !apply_tty_stdin)
414 return EXEC_INPUT_NULL;
415
416 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
417 return EXEC_INPUT_NULL;
418
419 return std_input;
420 }
421
422 static int fixup_output(ExecOutput std_output, int socket_fd) {
423
424 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
425 return EXEC_OUTPUT_INHERIT;
426
427 return std_output;
428 }
429
430 static int setup_input(
431 const ExecContext *context,
432 const ExecParameters *params,
433 int socket_fd,
434 int named_iofds[3]) {
435
436 ExecInput i;
437
438 assert(context);
439 assert(params);
440
441 if (params->stdin_fd >= 0) {
442 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
443 return -errno;
444
445 /* Try to make this the controlling tty, if it is a tty, and reset it */
446 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
447 (void) reset_terminal_fd(STDIN_FILENO, true);
448
449 return STDIN_FILENO;
450 }
451
452 i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
453
454 switch (i) {
455
456 case EXEC_INPUT_NULL:
457 return open_null_as(O_RDONLY, STDIN_FILENO);
458
459 case EXEC_INPUT_TTY:
460 case EXEC_INPUT_TTY_FORCE:
461 case EXEC_INPUT_TTY_FAIL: {
462 int fd, r;
463
464 fd = acquire_terminal(exec_context_tty_path(context),
465 i == EXEC_INPUT_TTY_FAIL,
466 i == EXEC_INPUT_TTY_FORCE,
467 false,
468 USEC_INFINITY);
469 if (fd < 0)
470 return fd;
471
472 if (fd != STDIN_FILENO) {
473 r = dup2(fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
474 safe_close(fd);
475 } else
476 r = STDIN_FILENO;
477
478 return r;
479 }
480
481 case EXEC_INPUT_SOCKET:
482 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
483
484 case EXEC_INPUT_NAMED_FD:
485 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
486 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
487
488 default:
489 assert_not_reached("Unknown input type");
490 }
491 }
492
493 static int setup_output(
494 Unit *unit,
495 const ExecContext *context,
496 const ExecParameters *params,
497 int fileno,
498 int socket_fd,
499 int named_iofds[3],
500 const char *ident,
501 uid_t uid,
502 gid_t gid,
503 dev_t *journal_stream_dev,
504 ino_t *journal_stream_ino) {
505
506 ExecOutput o;
507 ExecInput i;
508 int r;
509
510 assert(unit);
511 assert(context);
512 assert(params);
513 assert(ident);
514 assert(journal_stream_dev);
515 assert(journal_stream_ino);
516
517 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
518
519 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
520 return -errno;
521
522 return STDOUT_FILENO;
523 }
524
525 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
526 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
527 return -errno;
528
529 return STDERR_FILENO;
530 }
531
532 i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
533 o = fixup_output(context->std_output, socket_fd);
534
535 if (fileno == STDERR_FILENO) {
536 ExecOutput e;
537 e = fixup_output(context->std_error, socket_fd);
538
539 /* This expects the input and output are already set up */
540
541 /* Don't change the stderr file descriptor if we inherit all
542 * the way and are not on a tty */
543 if (e == EXEC_OUTPUT_INHERIT &&
544 o == EXEC_OUTPUT_INHERIT &&
545 i == EXEC_INPUT_NULL &&
546 !is_terminal_input(context->std_input) &&
547 getppid () != 1)
548 return fileno;
549
550 /* Duplicate from stdout if possible */
551 if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
552 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
553
554 o = e;
555
556 } else if (o == EXEC_OUTPUT_INHERIT) {
557 /* If input got downgraded, inherit the original value */
558 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
559 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
560
561 /* If the input is connected to anything that's not a /dev/null, inherit that... */
562 if (i != EXEC_INPUT_NULL)
563 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
564
565 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
566 if (getppid() != 1)
567 return fileno;
568
569 /* We need to open /dev/null here anew, to get the right access mode. */
570 return open_null_as(O_WRONLY, fileno);
571 }
572
573 switch (o) {
574
575 case EXEC_OUTPUT_NULL:
576 return open_null_as(O_WRONLY, fileno);
577
578 case EXEC_OUTPUT_TTY:
579 if (is_terminal_input(i))
580 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
581
582 /* We don't reset the terminal if this is just about output */
583 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
584
585 case EXEC_OUTPUT_SYSLOG:
586 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
587 case EXEC_OUTPUT_KMSG:
588 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
589 case EXEC_OUTPUT_JOURNAL:
590 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
591 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
592 if (r < 0) {
593 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
594 r = open_null_as(O_WRONLY, fileno);
595 } else {
596 struct stat st;
597
598 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
599 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
600 * services to detect whether they are connected to the journal or not.
601 *
602 * If both stdout and stderr are connected to a stream then let's make sure to store the data
603 * about STDERR as that's usually the best way to do logging. */
604
605 if (fstat(fileno, &st) >= 0 &&
606 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
607 *journal_stream_dev = st.st_dev;
608 *journal_stream_ino = st.st_ino;
609 }
610 }
611 return r;
612
613 case EXEC_OUTPUT_SOCKET:
614 assert(socket_fd >= 0);
615 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
616
617 case EXEC_OUTPUT_NAMED_FD:
618 (void) fd_nonblock(named_iofds[fileno], false);
619 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
620
621 default:
622 assert_not_reached("Unknown error type");
623 }
624 }
625
626 static int chown_terminal(int fd, uid_t uid) {
627 struct stat st;
628
629 assert(fd >= 0);
630
631 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
632 if (isatty(fd) < 1)
633 return 0;
634
635 /* This might fail. What matters are the results. */
636 (void) fchown(fd, uid, -1);
637 (void) fchmod(fd, TTY_MODE);
638
639 if (fstat(fd, &st) < 0)
640 return -errno;
641
642 if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
643 return -EPERM;
644
645 return 0;
646 }
647
648 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
649 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
650 int r;
651
652 assert(_saved_stdin);
653 assert(_saved_stdout);
654
655 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
656 if (saved_stdin < 0)
657 return -errno;
658
659 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
660 if (saved_stdout < 0)
661 return -errno;
662
663 fd = acquire_terminal(vc, false, false, false, DEFAULT_CONFIRM_USEC);
664 if (fd < 0)
665 return fd;
666
667 r = chown_terminal(fd, getuid());
668 if (r < 0)
669 return r;
670
671 r = reset_terminal_fd(fd, true);
672 if (r < 0)
673 return r;
674
675 if (dup2(fd, STDIN_FILENO) < 0)
676 return -errno;
677
678 if (dup2(fd, STDOUT_FILENO) < 0)
679 return -errno;
680
681 if (fd >= 2)
682 safe_close(fd);
683 fd = -1;
684
685 *_saved_stdin = saved_stdin;
686 *_saved_stdout = saved_stdout;
687
688 saved_stdin = saved_stdout = -1;
689
690 return 0;
691 }
692
693 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
694 assert(err < 0);
695
696 if (err == -ETIMEDOUT)
697 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
698 else {
699 errno = -err;
700 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
701 }
702 }
703
704 static void write_confirm_error(int err, const char *vc, const Unit *u) {
705 _cleanup_close_ int fd = -1;
706
707 assert(vc);
708
709 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
710 if (fd < 0)
711 return;
712
713 write_confirm_error_fd(err, fd, u);
714 }
715
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved fds back in place as stdin/stdout (as
 * far as they are valid) and closes the saved copies, resetting *saved_stdin/*saved_stdout to what
 * safe_close() returns.
 *
 * Returns 0 on success, or the negative errno-style value of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
737
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1,   /* don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0,   /* don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE         =  1,   /* execute the command */
};
743
744 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
745 int saved_stdout = -1, saved_stdin = -1, r;
746 _cleanup_free_ char *e = NULL;
747 char c;
748
749 /* For any internal errors, assume a positive response. */
750 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
751 if (r < 0) {
752 write_confirm_error(r, vc, u);
753 return CONFIRM_EXECUTE;
754 }
755
756 /* confirm_spawn might have been disabled while we were sleeping. */
757 if (manager_is_confirm_spawn_disabled(u->manager)) {
758 r = 1;
759 goto restore_stdio;
760 }
761
762 e = ellipsize(cmdline, 60, 100);
763 if (!e) {
764 log_oom();
765 r = CONFIRM_EXECUTE;
766 goto restore_stdio;
767 }
768
769 for (;;) {
770 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
771 if (r < 0) {
772 write_confirm_error_fd(r, STDOUT_FILENO, u);
773 r = CONFIRM_EXECUTE;
774 goto restore_stdio;
775 }
776
777 switch (c) {
778 case 'c':
779 printf("Resuming normal execution.\n");
780 manager_disable_confirm_spawn();
781 r = 1;
782 break;
783 case 'D':
784 unit_dump(u, stdout, " ");
785 continue; /* ask again */
786 case 'f':
787 printf("Failing execution.\n");
788 r = CONFIRM_PRETEND_FAILURE;
789 break;
790 case 'h':
791 printf(" c - continue, proceed without asking anymore\n"
792 " D - dump, show the state of the unit\n"
793 " f - fail, don't execute the command and pretend it failed\n"
794 " h - help\n"
795 " i - info, show a short summary of the unit\n"
796 " j - jobs, show jobs that are in progress\n"
797 " s - skip, don't execute the command and pretend it succeeded\n"
798 " y - yes, execute the command\n");
799 continue; /* ask again */
800 case 'i':
801 printf(" Description: %s\n"
802 " Unit: %s\n"
803 " Command: %s\n",
804 u->id, u->description, cmdline);
805 continue; /* ask again */
806 case 'j':
807 manager_dump_jobs(u->manager, stdout, " ");
808 continue; /* ask again */
809 case 'n':
810 /* 'n' was removed in favor of 'f'. */
811 printf("Didn't understand 'n', did you mean 'f'?\n");
812 continue; /* ask again */
813 case 's':
814 printf("Skipping execution.\n");
815 r = CONFIRM_PRETEND_SUCCESS;
816 break;
817 case 'y':
818 r = CONFIRM_EXECUTE;
819 break;
820 default:
821 assert_not_reached("Unhandled choice");
822 }
823 break;
824 }
825
826 restore_stdio:
827 restore_confirm_stdio(&saved_stdin, &saved_stdout);
828 return r;
829 }
830
831 static int get_fixed_user(const ExecContext *c, const char **user,
832 uid_t *uid, gid_t *gid,
833 const char **home, const char **shell) {
834 int r;
835 const char *name;
836
837 assert(c);
838
839 if (!c->user)
840 return 0;
841
842 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
843 * (i.e. are "/" or "/bin/nologin"). */
844
845 name = c->user;
846 r = get_user_creds_clean(&name, uid, gid, home, shell);
847 if (r < 0)
848 return r;
849
850 *user = name;
851 return 0;
852 }
853
854 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
855 int r;
856 const char *name;
857
858 assert(c);
859
860 if (!c->group)
861 return 0;
862
863 name = c->group;
864 r = get_group_creds(&name, gid);
865 if (r < 0)
866 return r;
867
868 *group = name;
869 return 0;
870 }
871
872 static int get_supplementary_groups(const ExecContext *c, const char *user,
873 const char *group, gid_t gid,
874 gid_t **supplementary_gids, int *ngids) {
875 char **i;
876 int r, k = 0;
877 int ngroups_max;
878 bool keep_groups = false;
879 gid_t *groups = NULL;
880 _cleanup_free_ gid_t *l_gids = NULL;
881
882 assert(c);
883
884 /*
885 * If user is given, then lookup GID and supplementary groups list.
886 * We avoid NSS lookups for gid=0. Also we have to initialize groups
887 * here and as early as possible so we keep the list of supplementary
888 * groups of the caller.
889 */
890 if (user && gid_is_valid(gid) && gid != 0) {
891 /* First step, initialize groups from /etc/groups */
892 if (initgroups(user, gid) < 0)
893 return -errno;
894
895 keep_groups = true;
896 }
897
898 if (strv_isempty(c->supplementary_groups))
899 return 0;
900
901 /*
902 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
903 * be positive, otherwise fail.
904 */
905 errno = 0;
906 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
907 if (ngroups_max <= 0) {
908 if (errno > 0)
909 return -errno;
910 else
911 return -EOPNOTSUPP; /* For all other values */
912 }
913
914 l_gids = new(gid_t, ngroups_max);
915 if (!l_gids)
916 return -ENOMEM;
917
918 if (keep_groups) {
919 /*
920 * Lookup the list of groups that the user belongs to, we
921 * avoid NSS lookups here too for gid=0.
922 */
923 k = ngroups_max;
924 if (getgrouplist(user, gid, l_gids, &k) < 0)
925 return -EINVAL;
926 } else
927 k = 0;
928
929 STRV_FOREACH(i, c->supplementary_groups) {
930 const char *g;
931
932 if (k >= ngroups_max)
933 return -E2BIG;
934
935 g = *i;
936 r = get_group_creds(&g, l_gids+k);
937 if (r < 0)
938 return r;
939
940 k++;
941 }
942
943 /*
944 * Sets ngids to zero to drop all supplementary groups, happens
945 * when we are under root and SupplementaryGroups= is empty.
946 */
947 if (k == 0) {
948 *ngids = 0;
949 return 0;
950 }
951
952 /* Otherwise get the final list of supplementary groups */
953 groups = memdup(l_gids, sizeof(gid_t) * k);
954 if (!groups)
955 return -ENOMEM;
956
957 *supplementary_gids = groups;
958 *ngids = k;
959
960 groups = NULL;
961
962 return 0;
963 }
964
/* Applies the group credentials to the current process: first the supplementary groups (only if the list is
 * not empty), then the real, effective and saved GID (only if gid is valid).
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int enforce_groups(gid_t gid, gid_t *supplementary_gids, int ngids) {

        if (ngids > 0) {
                int r;

                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
983
984 static int enforce_user(const ExecContext *context, uid_t uid) {
985 assert(context);
986
987 if (!uid_is_valid(uid))
988 return 0;
989
990 /* Sets (but doesn't look up) the uid and make sure we keep the
991 * capabilities while doing so. */
992
993 if (context->capability_ambient_set != 0) {
994
995 /* First step: If we need to keep capabilities but
996 * drop privileges we need to make sure we keep our
997 * caps, while we drop privileges. */
998 if (uid != 0) {
999 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1000
1001 if (prctl(PR_GET_SECUREBITS) != sb)
1002 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1003 return -errno;
1004 }
1005 }
1006
1007 /* Second step: actually set the uids */
1008 if (setresuid(uid, uid, uid) < 0)
1009 return -errno;
1010
1011 /* At this point we should have all necessary capabilities but
1012 are otherwise a normal user. However, the caps might got
1013 corrupted due to the setresuid() so we need clean them up
1014 later. This is done outside of this call. */
1015
1016 return 0;
1017 }
1018
1019 #if HAVE_PAM
1020
1021 static int null_conv(
1022 int num_msg,
1023 const struct pam_message **msg,
1024 struct pam_response **resp,
1025 void *appdata_ptr) {
1026
1027 /* We don't support conversations */
1028
1029 return PAM_CONV_ERR;
1030 }
1031
1032 #endif
1033
/* Opens a PAM session for the service "name" and the specified user, and forks off a helper process that
 * closes the session again when the calling process goes away.
 *
 * name:     PAM service name, passed to pam_start()
 * user:     user name, passed to pam_start()
 * uid, gid: credentials the helper process changes to
 * tty:      if not NULL, set as PAM_TTY item
 * env:      environment block; its entries are exported to PAM, and on success it is replaced by the
 *           environment list PAM returns (the old list is freed)
 * fds:      n_fds file descriptors which are closed in the helper process
 *
 * Returns 0 on success. On failure a negative errno-style value is returned; PAM errors are reported as
 * -EPERM, since they do not map to errno. If built without PAM support this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], unsigned n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the session has to be closed again on failure */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        pam_pid = fork();
        if (pam_pid < 0) {
                r = -errno;
                goto fail;
        }

        if (pam_pid == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* This string must fit in 10 chars (i.e. the length
                 * of "/sbin/init"), to look pretty in /bin/ps */
                rename_process("(sd-pam)");

                /* Make sure we don't keep open the passed fds in this
                child. We assume that otherwise only those fds are
                open here that have been opened by PAM. */
                close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* Note that sigwait() reports failure by returning a positive error
                                 * number, it neither returns a negative value nor sets errno. Hence
                                 * check the return value itself, and only look at "sig" on success. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Replace the caller's environment block by the one PAM gave us */
        strv_free(*env);
        *env = e;

        return 0;

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1250
/* Renames the current process after the file name of the specified path, put in parentheses. If the file
 * name is longer than 8 characters only its last 8 characters are used; if it is empty the name "(...)" is
 * used instead. */
static void rename_process_from_path(const char *path) {
        char name[11];
        const char *base;
        size_t len;

        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
         * /bin/ps, hence the buffer of 11 bytes: '(' + up to 8 characters + ')' + NUL */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        len = strlen(base);
        if (len > 8) {
                /* Keep the tail: the end of the process name is usually more interesting, since the first
                 * bit might just be "systemd-" */
                base += len - 8;
                len = 8;
        }

        name[0] = '(';
        memcpy(name + 1, base, len);
        name[len + 1] = ')';
        name[len + 2] = 0;

        rename_process(name);
}
1281
1282 static bool context_has_address_families(const ExecContext *c) {
1283 assert(c);
1284
1285 return c->address_families_whitelist ||
1286 !set_isempty(c->address_families);
1287 }
1288
1289 static bool context_has_syscall_filters(const ExecContext *c) {
1290 assert(c);
1291
1292 return c->syscall_whitelist ||
1293 !hashmap_isempty(c->syscall_filter);
1294 }
1295
/* Decides whether the "no new privileges" flag is needed for this context: either because it was requested
 * explicitly, or because we lack CAP_SYS_ADMIN and one of the seccomp-backed settings is enabled. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        /* An explicit request always wins. */
        if (c->no_new_privileges)
                return true;

        /* If we are privileged, we don't need NNP. */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* We are unprivileged: any form of seccomp requires NNP. */
        if (context_has_address_families(c))
                return true;

        if (c->memory_deny_write_execute || c->restrict_realtime)
                return true;

        if (exec_context_restrict_namespaces_set(c))
                return true;

        if (c->protect_kernel_tunables || c->protect_kernel_modules || c->private_devices)
                return true;

        if (context_has_syscall_filters(c))
                return true;

        if (!set_isempty(c->syscall_archs))
                return true;

        return c->lock_personality;
}
1317
1318 #if HAVE_SECCOMP
1319
1320 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1321
1322 if (is_seccomp_available())
1323 return false;
1324
1325 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1326 return true;
1327 }
1328
1329 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1330 uint32_t negative_action, default_action, action;
1331 int r;
1332
1333 assert(u);
1334 assert(c);
1335
1336 if (!context_has_syscall_filters(c))
1337 return 0;
1338
1339 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1340 return 0;
1341
1342 negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1343
1344 if (c->syscall_whitelist) {
1345 default_action = negative_action;
1346 action = SCMP_ACT_ALLOW;
1347 } else {
1348 default_action = SCMP_ACT_ALLOW;
1349 action = negative_action;
1350 }
1351
1352 if (needs_ambient_hack) {
1353 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1354 if (r < 0)
1355 return r;
1356 }
1357
1358 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1359 }
1360
1361 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1362 assert(u);
1363 assert(c);
1364
1365 if (set_isempty(c->syscall_archs))
1366 return 0;
1367
1368 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1369 return 0;
1370
1371 return seccomp_restrict_archs(c->syscall_archs);
1372 }
1373
1374 static int apply_address_families(const Unit* u, const ExecContext *c) {
1375 assert(u);
1376 assert(c);
1377
1378 if (!context_has_address_families(c))
1379 return 0;
1380
1381 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1382 return 0;
1383
1384 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1385 }
1386
1387 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1388 assert(u);
1389 assert(c);
1390
1391 if (!c->memory_deny_write_execute)
1392 return 0;
1393
1394 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1395 return 0;
1396
1397 return seccomp_memory_deny_write_execute();
1398 }
1399
1400 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1401 assert(u);
1402 assert(c);
1403
1404 if (!c->restrict_realtime)
1405 return 0;
1406
1407 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1408 return 0;
1409
1410 return seccomp_restrict_realtime();
1411 }
1412
1413 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1414 assert(u);
1415 assert(c);
1416
1417 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1418 * let's protect even those systems where this is left on in the kernel. */
1419
1420 if (!c->protect_kernel_tunables)
1421 return 0;
1422
1423 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1424 return 0;
1425
1426 return seccomp_protect_sysctl();
1427 }
1428
1429 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1430 assert(u);
1431 assert(c);
1432
1433 /* Turn off module syscalls on ProtectKernelModules=yes */
1434
1435 if (!c->protect_kernel_modules)
1436 return 0;
1437
1438 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1439 return 0;
1440
1441 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1442 }
1443
1444 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1445 assert(u);
1446 assert(c);
1447
1448 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1449
1450 if (!c->private_devices)
1451 return 0;
1452
1453 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1454 return 0;
1455
1456 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1457 }
1458
1459 static int apply_restrict_namespaces(Unit *u, const ExecContext *c) {
1460 assert(u);
1461 assert(c);
1462
1463 if (!exec_context_restrict_namespaces_set(c))
1464 return 0;
1465
1466 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1467 return 0;
1468
1469 return seccomp_restrict_namespaces(c->restrict_namespaces);
1470 }
1471
1472 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1473 unsigned long personality;
1474 int r;
1475
1476 assert(u);
1477 assert(c);
1478
1479 if (!c->lock_personality)
1480 return 0;
1481
1482 if (skip_seccomp_unavailable(u, "LockPersonality="))
1483 return 0;
1484
1485 personality = c->personality;
1486
1487 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1488 if (personality == PERSONALITY_INVALID) {
1489
1490 r = opinionated_personality(&personality);
1491 if (r < 0)
1492 return r;
1493 }
1494
1495 return seccomp_lock_personality(personality);
1496 }
1497
1498 #endif
1499
1500 static void do_idle_pipe_dance(int idle_pipe[4]) {
1501 assert(idle_pipe);
1502
1503 idle_pipe[1] = safe_close(idle_pipe[1]);
1504 idle_pipe[2] = safe_close(idle_pipe[2]);
1505
1506 if (idle_pipe[0] >= 0) {
1507 int r;
1508
1509 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1510
1511 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1512 ssize_t n;
1513
1514 /* Signal systemd that we are bored and want to continue. */
1515 n = write(idle_pipe[3], "x", 1);
1516 if (n > 0)
1517 /* Wait for systemd to react to the signal above. */
1518 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1519 }
1520
1521 idle_pipe[0] = safe_close(idle_pipe[0]);
1522
1523 }
1524
1525 idle_pipe[3] = safe_close(idle_pipe[3]);
1526 }
1527
1528 static int build_environment(
1529 Unit *u,
1530 const ExecContext *c,
1531 const ExecParameters *p,
1532 unsigned n_fds,
1533 const char *home,
1534 const char *username,
1535 const char *shell,
1536 dev_t journal_stream_dev,
1537 ino_t journal_stream_ino,
1538 char ***ret) {
1539
1540 _cleanup_strv_free_ char **our_env = NULL;
1541 unsigned n_env = 0;
1542 char *x;
1543
1544 assert(u);
1545 assert(c);
1546 assert(ret);
1547
1548 our_env = new0(char*, 14);
1549 if (!our_env)
1550 return -ENOMEM;
1551
1552 if (n_fds > 0) {
1553 _cleanup_free_ char *joined = NULL;
1554
1555 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1556 return -ENOMEM;
1557 our_env[n_env++] = x;
1558
1559 if (asprintf(&x, "LISTEN_FDS=%u", n_fds) < 0)
1560 return -ENOMEM;
1561 our_env[n_env++] = x;
1562
1563 joined = strv_join(p->fd_names, ":");
1564 if (!joined)
1565 return -ENOMEM;
1566
1567 x = strjoin("LISTEN_FDNAMES=", joined);
1568 if (!x)
1569 return -ENOMEM;
1570 our_env[n_env++] = x;
1571 }
1572
1573 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1574 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1575 return -ENOMEM;
1576 our_env[n_env++] = x;
1577
1578 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1579 return -ENOMEM;
1580 our_env[n_env++] = x;
1581 }
1582
1583 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1584 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1585 * check the database directly. */
1586 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1587 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1588 if (!x)
1589 return -ENOMEM;
1590 our_env[n_env++] = x;
1591 }
1592
1593 if (home) {
1594 x = strappend("HOME=", home);
1595 if (!x)
1596 return -ENOMEM;
1597 our_env[n_env++] = x;
1598 }
1599
1600 if (username) {
1601 x = strappend("LOGNAME=", username);
1602 if (!x)
1603 return -ENOMEM;
1604 our_env[n_env++] = x;
1605
1606 x = strappend("USER=", username);
1607 if (!x)
1608 return -ENOMEM;
1609 our_env[n_env++] = x;
1610 }
1611
1612 if (shell) {
1613 x = strappend("SHELL=", shell);
1614 if (!x)
1615 return -ENOMEM;
1616 our_env[n_env++] = x;
1617 }
1618
1619 if (!sd_id128_is_null(u->invocation_id)) {
1620 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1621 return -ENOMEM;
1622
1623 our_env[n_env++] = x;
1624 }
1625
1626 if (exec_context_needs_term(c)) {
1627 const char *tty_path, *term = NULL;
1628
1629 tty_path = exec_context_tty_path(c);
1630
1631 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1632 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1633 * passes to PID 1 ends up all the way in the console login shown. */
1634
1635 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1636 term = getenv("TERM");
1637 if (!term)
1638 term = default_term_for_tty(tty_path);
1639
1640 x = strappend("TERM=", term);
1641 if (!x)
1642 return -ENOMEM;
1643 our_env[n_env++] = x;
1644 }
1645
1646 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1647 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1648 return -ENOMEM;
1649
1650 our_env[n_env++] = x;
1651 }
1652
1653 our_env[n_env++] = NULL;
1654 assert(n_env <= 12);
1655
1656 *ret = our_env;
1657 our_env = NULL;
1658
1659 return 0;
1660 }
1661
1662 static int build_pass_environment(const ExecContext *c, char ***ret) {
1663 _cleanup_strv_free_ char **pass_env = NULL;
1664 size_t n_env = 0, n_bufsize = 0;
1665 char **i;
1666
1667 STRV_FOREACH(i, c->pass_environment) {
1668 _cleanup_free_ char *x = NULL;
1669 char *v;
1670
1671 v = getenv(*i);
1672 if (!v)
1673 continue;
1674 x = strjoin(*i, "=", v);
1675 if (!x)
1676 return -ENOMEM;
1677
1678 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1679 return -ENOMEM;
1680
1681 pass_env[n_env++] = x;
1682 pass_env[n_env] = NULL;
1683 x = NULL;
1684 }
1685
1686 *ret = pass_env;
1687 pass_env = NULL;
1688
1689 return 0;
1690 }
1691
1692 static bool exec_needs_mount_namespace(
1693 const ExecContext *context,
1694 const ExecParameters *params,
1695 ExecRuntime *runtime) {
1696
1697 assert(context);
1698 assert(params);
1699
1700 if (context->root_image)
1701 return true;
1702
1703 if (!strv_isempty(context->read_write_paths) ||
1704 !strv_isempty(context->read_only_paths) ||
1705 !strv_isempty(context->inaccessible_paths))
1706 return true;
1707
1708 if (context->n_bind_mounts > 0 ||
1709 !strv_isempty(context->directories[EXEC_DIRECTORY_RUNTIME].paths) ||
1710 !strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1711 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1712 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths) ||
1713 !strv_isempty(context->directories[EXEC_DIRECTORY_CONFIGURATION].paths))
1714 return true;
1715
1716 if (context->mount_flags != 0)
1717 return true;
1718
1719 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1720 return true;
1721
1722 if (context->private_devices ||
1723 context->protect_system != PROTECT_SYSTEM_NO ||
1724 context->protect_home != PROTECT_HOME_NO ||
1725 context->protect_kernel_tunables ||
1726 context->protect_kernel_modules ||
1727 context->protect_control_groups)
1728 return true;
1729
1730 if (context->mount_apivfs && (context->root_image || context->root_directory))
1731 return true;
1732
1733 return false;
1734 }
1735
1736 static int setup_private_users(uid_t uid, gid_t gid) {
1737 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1738 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1739 _cleanup_close_ int unshare_ready_fd = -1;
1740 _cleanup_(sigkill_waitp) pid_t pid = 0;
1741 uint64_t c = 1;
1742 siginfo_t si;
1743 ssize_t n;
1744 int r;
1745
1746 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1747 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1748 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1749 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1750 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1751 * continues execution normally. */
1752
1753 if (uid != 0 && uid_is_valid(uid)) {
1754 r = asprintf(&uid_map,
1755 "0 0 1\n" /* Map root → root */
1756 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
1757 uid, uid);
1758 if (r < 0)
1759 return -ENOMEM;
1760 } else {
1761 uid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1762 if (!uid_map)
1763 return -ENOMEM;
1764 }
1765
1766 if (gid != 0 && gid_is_valid(gid)) {
1767 r = asprintf(&gid_map,
1768 "0 0 1\n" /* Map root → root */
1769 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
1770 gid, gid);
1771 if (r < 0)
1772 return -ENOMEM;
1773 } else {
1774 gid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1775 if (!gid_map)
1776 return -ENOMEM;
1777 }
1778
1779 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1780 * namespace. */
1781 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1782 if (unshare_ready_fd < 0)
1783 return -errno;
1784
1785 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1786 * failed. */
1787 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1788 return -errno;
1789
1790 pid = fork();
1791 if (pid < 0)
1792 return -errno;
1793
1794 if (pid == 0) {
1795 _cleanup_close_ int fd = -1;
1796 const char *a;
1797 pid_t ppid;
1798
1799 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1800 * here, after the parent opened its own user namespace. */
1801
1802 ppid = getppid();
1803 errno_pipe[0] = safe_close(errno_pipe[0]);
1804
1805 /* Wait until the parent unshared the user namespace */
1806 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1807 r = -errno;
1808 goto child_fail;
1809 }
1810
1811 /* Disable the setgroups() system call in the child user namespace, for good. */
1812 a = procfs_file_alloca(ppid, "setgroups");
1813 fd = open(a, O_WRONLY|O_CLOEXEC);
1814 if (fd < 0) {
1815 if (errno != ENOENT) {
1816 r = -errno;
1817 goto child_fail;
1818 }
1819
1820 /* If the file is missing the kernel is too old, let's continue anyway. */
1821 } else {
1822 if (write(fd, "deny\n", 5) < 0) {
1823 r = -errno;
1824 goto child_fail;
1825 }
1826
1827 fd = safe_close(fd);
1828 }
1829
1830 /* First write the GID map */
1831 a = procfs_file_alloca(ppid, "gid_map");
1832 fd = open(a, O_WRONLY|O_CLOEXEC);
1833 if (fd < 0) {
1834 r = -errno;
1835 goto child_fail;
1836 }
1837 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1838 r = -errno;
1839 goto child_fail;
1840 }
1841 fd = safe_close(fd);
1842
1843 /* The write the UID map */
1844 a = procfs_file_alloca(ppid, "uid_map");
1845 fd = open(a, O_WRONLY|O_CLOEXEC);
1846 if (fd < 0) {
1847 r = -errno;
1848 goto child_fail;
1849 }
1850 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1851 r = -errno;
1852 goto child_fail;
1853 }
1854
1855 _exit(EXIT_SUCCESS);
1856
1857 child_fail:
1858 (void) write(errno_pipe[1], &r, sizeof(r));
1859 _exit(EXIT_FAILURE);
1860 }
1861
1862 errno_pipe[1] = safe_close(errno_pipe[1]);
1863
1864 if (unshare(CLONE_NEWUSER) < 0)
1865 return -errno;
1866
1867 /* Let the child know that the namespace is ready now */
1868 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1869 return -errno;
1870
1871 /* Try to read an error code from the child */
1872 n = read(errno_pipe[0], &r, sizeof(r));
1873 if (n < 0)
1874 return -errno;
1875 if (n == sizeof(r)) { /* an error code was sent to us */
1876 if (r < 0)
1877 return r;
1878 return -EIO;
1879 }
1880 if (n != 0) /* on success we should have read 0 bytes */
1881 return -EIO;
1882
1883 r = wait_for_terminate(pid, &si);
1884 if (r < 0)
1885 return r;
1886 pid = 0;
1887
1888 /* If something strange happened with the child, let's consider this fatal, too */
1889 if (si.si_code != CLD_EXITED || si.si_status != 0)
1890 return -EIO;
1891
1892 return 0;
1893 }
1894
1895 static int setup_exec_directory(
1896 const ExecContext *context,
1897 const ExecParameters *params,
1898 uid_t uid,
1899 gid_t gid,
1900 ExecDirectoryType type,
1901 int *exit_status) {
1902
1903 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1904 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1905 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1906 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1907 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
1908 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
1909 };
1910 char **rt;
1911 int r;
1912
1913 assert(context);
1914 assert(params);
1915 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
1916 assert(exit_status);
1917
1918 if (!params->prefix[type])
1919 return 0;
1920
1921 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
1922 if (!uid_is_valid(uid))
1923 uid = 0;
1924 if (!gid_is_valid(gid))
1925 gid = 0;
1926 }
1927
1928 STRV_FOREACH(rt, context->directories[type].paths) {
1929 _cleanup_free_ char *p = NULL, *pp = NULL;
1930 const char *effective;
1931
1932 p = strjoin(params->prefix[type], "/", *rt);
1933 if (!p) {
1934 r = -ENOMEM;
1935 goto fail;
1936 }
1937
1938 r = mkdir_parents_label(p, 0755);
1939 if (r < 0)
1940 goto fail;
1941
1942 if (context->dynamic_user &&
1943 !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
1944 _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
1945
1946 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
1947 * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
1948 * whose UID is later on reused. To lock this down we use the same trick used by container
1949 * managers to prohibit host users to get access to files of the same UID in containers: we
1950 * place everything inside a directory that has an access mode of 0700 and is owned root:root,
1951 * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
1952 * to make this directory permeable for the service itself.
1953 *
1954 * Specifically: for a service which wants a special directory "foo/" we first create a
1955 * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
1956 * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
1957 * privileged host users can access "foo/" as usual, but unprivileged host users can't look
1958 * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
1959 * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
1960 * disabling the access boundary for the service and making sure it only gets access to the
1961 * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
1962 *
1963 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
1964 * owned by the service itself.
1965 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
1966 * files or sockets with other services. */
1967
1968 private_root = strjoin(params->prefix[type], "/private");
1969 if (!private_root) {
1970 r = -ENOMEM;
1971 goto fail;
1972 }
1973
1974 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
1975 r = mkdir_safe_label(private_root, 0700, 0, 0, false);
1976 if (r < 0)
1977 goto fail;
1978
1979 pp = strjoin(private_root, "/", *rt);
1980 if (!pp) {
1981 r = -ENOMEM;
1982 goto fail;
1983 }
1984
1985 /* Create all directories between the configured directory and this private root, and mark them 0755 */
1986 r = mkdir_parents_label(pp, 0755);
1987 if (r < 0)
1988 goto fail;
1989
1990 /* Finally, create the actual directory for the service */
1991 r = mkdir_label(pp, context->directories[type].mode);
1992 if (r < 0 && r != -EEXIST)
1993 goto fail;
1994
1995 parent = dirname_malloc(p);
1996 if (!parent) {
1997 r = -ENOMEM;
1998 goto fail;
1999 }
2000
2001 r = path_make_relative(parent, pp, &relative);
2002 if (r < 0)
2003 goto fail;
2004
2005 /* And link it up from the original place */
2006 r = symlink_idempotent(relative, p);
2007 if (r < 0)
2008 goto fail;
2009
2010 effective = pp;
2011
2012 } else {
2013 r = mkdir_label(p, context->directories[type].mode);
2014 if (r < 0 && r != -EEXIST)
2015 goto fail;
2016
2017 effective = p;
2018 }
2019
2020 /* First lock down the access mode */
2021 if (chmod(effective, context->directories[type].mode) < 0) {
2022 r = -errno;
2023 goto fail;
2024 }
2025
2026 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2027 * a service, and shall not be writable. */
2028 if (type == EXEC_DIRECTORY_CONFIGURATION)
2029 continue;
2030
2031 /* Then, change the ownership of the whole tree, if necessary */
2032 r = path_chown_recursive(effective, uid, gid);
2033 if (r < 0)
2034 goto fail;
2035 }
2036
2037 return 0;
2038
2039 fail:
2040 *exit_status = exit_status_table[type];
2041 return r;
2042 }
2043
2044 static int setup_smack(
2045 const ExecContext *context,
2046 const ExecCommand *command) {
2047
2048 int r;
2049
2050 assert(context);
2051 assert(command);
2052
2053 if (context->smack_process_label) {
2054 r = mac_smack_apply_pid(0, context->smack_process_label);
2055 if (r < 0)
2056 return r;
2057 }
2058 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2059 else {
2060 _cleanup_free_ char *exec_label = NULL;
2061
2062 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2063 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2064 return r;
2065
2066 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2067 if (r < 0)
2068 return r;
2069 }
2070 #endif
2071
2072 return 0;
2073 }
2074
2075 static int compile_bind_mounts(
2076 const ExecContext *context,
2077 const ExecParameters *params,
2078 BindMount **ret_bind_mounts,
2079 unsigned *ret_n_bind_mounts,
2080 char ***ret_empty_directories) {
2081
2082 _cleanup_strv_free_ char **empty_directories = NULL;
2083 BindMount *bind_mounts;
2084 unsigned n, h = 0, i;
2085 ExecDirectoryType t;
2086 int r;
2087
2088 assert(context);
2089 assert(params);
2090 assert(ret_bind_mounts);
2091 assert(ret_n_bind_mounts);
2092 assert(ret_empty_directories);
2093
2094 n = context->n_bind_mounts;
2095 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2096 if (!params->prefix[t])
2097 continue;
2098
2099 n += strv_length(context->directories[t].paths);
2100 }
2101
2102 if (n <= 0) {
2103 *ret_bind_mounts = NULL;
2104 *ret_n_bind_mounts = 0;
2105 *ret_empty_directories = NULL;
2106 return 0;
2107 }
2108
2109 bind_mounts = new(BindMount, n);
2110 if (!bind_mounts)
2111 return -ENOMEM;
2112
2113 for (i = 0; i < context->n_bind_mounts; i++) {
2114 BindMount *item = context->bind_mounts + i;
2115 char *s, *d;
2116
2117 s = strdup(item->source);
2118 if (!s) {
2119 r = -ENOMEM;
2120 goto finish;
2121 }
2122
2123 d = strdup(item->destination);
2124 if (!d) {
2125 free(s);
2126 r = -ENOMEM;
2127 goto finish;
2128 }
2129
2130 bind_mounts[h++] = (BindMount) {
2131 .source = s,
2132 .destination = d,
2133 .read_only = item->read_only,
2134 .recursive = item->recursive,
2135 .ignore_enoent = item->ignore_enoent,
2136 };
2137 }
2138
2139 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2140 char **suffix;
2141
2142 if (!params->prefix[t])
2143 continue;
2144
2145 if (strv_isempty(context->directories[t].paths))
2146 continue;
2147
2148 if (context->dynamic_user &&
2149 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2150 char *private_root;
2151
2152 /* So this is for a dynamic user, and we need to make sure the process can access its own
2153 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2154 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2155
2156 private_root = strjoin(params->prefix[t], "/private");
2157 if (!private_root) {
2158 r = -ENOMEM;
2159 goto finish;
2160 }
2161
2162 r = strv_consume(&empty_directories, private_root);
2163 if (r < 0) {
2164 r = -ENOMEM;
2165 goto finish;
2166 }
2167 }
2168
2169 STRV_FOREACH(suffix, context->directories[t].paths) {
2170 char *s, *d;
2171
2172 if (context->dynamic_user &&
2173 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2174 s = strjoin(params->prefix[t], "/private/", *suffix);
2175 else
2176 s = strjoin(params->prefix[t], "/", *suffix);
2177 if (!s) {
2178 r = -ENOMEM;
2179 goto finish;
2180 }
2181
2182 d = strdup(s);
2183 if (!d) {
2184 free(s);
2185 r = -ENOMEM;
2186 goto finish;
2187 }
2188
2189 bind_mounts[h++] = (BindMount) {
2190 .source = s,
2191 .destination = d,
2192 .read_only = false,
2193 .recursive = true,
2194 .ignore_enoent = false,
2195 };
2196 }
2197 }
2198
2199 assert(h == n);
2200
2201 *ret_bind_mounts = bind_mounts;
2202 *ret_n_bind_mounts = n;
2203 *ret_empty_directories = empty_directories;
2204
2205 empty_directories = NULL;
2206
2207 return (int) n;
2208
2209 finish:
2210 bind_mount_free_many(bind_mounts, h);
2211 return r;
2212 }
2213
2214 static int apply_mount_namespace(
2215 Unit *u,
2216 ExecCommand *command,
2217 const ExecContext *context,
2218 const ExecParameters *params,
2219 ExecRuntime *runtime) {
2220
2221 _cleanup_strv_free_ char **empty_directories = NULL;
2222 char *tmp = NULL, *var = NULL;
2223 const char *root_dir = NULL, *root_image = NULL;
2224 NamespaceInfo ns_info = {
2225 .ignore_protect_paths = false,
2226 .private_dev = context->private_devices,
2227 .protect_control_groups = context->protect_control_groups,
2228 .protect_kernel_tunables = context->protect_kernel_tunables,
2229 .protect_kernel_modules = context->protect_kernel_modules,
2230 .mount_apivfs = context->mount_apivfs,
2231 };
2232 bool needs_sandboxing;
2233 BindMount *bind_mounts = NULL;
2234 unsigned n_bind_mounts = 0;
2235 int r;
2236
2237 assert(context);
2238
2239 /* The runtime struct only contains the parent of the private /tmp,
2240 * which is non-accessible to world users. Inside of it there's a /tmp
2241 * that is sticky, and that's the one we want to use here. */
2242
2243 if (context->private_tmp && runtime) {
2244 if (runtime->tmp_dir)
2245 tmp = strjoina(runtime->tmp_dir, "/tmp");
2246 if (runtime->var_tmp_dir)
2247 var = strjoina(runtime->var_tmp_dir, "/tmp");
2248 }
2249
2250 if (params->flags & EXEC_APPLY_CHROOT) {
2251 root_image = context->root_image;
2252
2253 if (!root_image)
2254 root_dir = context->root_directory;
2255 }
2256
2257 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2258 if (r < 0)
2259 return r;
2260
2261 /*
2262 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2263 * sandbox info, otherwise enforce it, don't ignore protected paths and
2264 * fail if we are enable to apply the sandbox inside the mount namespace.
2265 */
2266 if (!context->dynamic_user && root_dir)
2267 ns_info.ignore_protect_paths = true;
2268
2269 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2270
2271 r = setup_namespace(root_dir, root_image,
2272 &ns_info, context->read_write_paths,
2273 needs_sandboxing ? context->read_only_paths : NULL,
2274 needs_sandboxing ? context->inaccessible_paths : NULL,
2275 empty_directories,
2276 bind_mounts,
2277 n_bind_mounts,
2278 tmp,
2279 var,
2280 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2281 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2282 context->mount_flags,
2283 DISSECT_IMAGE_DISCARD_ON_LOOP);
2284
2285 bind_mount_free_many(bind_mounts, n_bind_mounts);
2286
2287 /* If we couldn't set up the namespace this is probably due to a
2288 * missing capability. In this case, silently proceeed. */
2289 if (IN_SET(r, -EPERM, -EACCES)) {
2290 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2291 return 0;
2292 }
2293
2294 return r;
2295 }
2296
2297 static int apply_working_directory(
2298 const ExecContext *context,
2299 const ExecParameters *params,
2300 const char *home,
2301 const bool needs_mount_ns,
2302 int *exit_status) {
2303
2304 const char *d, *wd;
2305
2306 assert(context);
2307 assert(exit_status);
2308
2309 if (context->working_directory_home) {
2310
2311 if (!home) {
2312 *exit_status = EXIT_CHDIR;
2313 return -ENXIO;
2314 }
2315
2316 wd = home;
2317
2318 } else if (context->working_directory)
2319 wd = context->working_directory;
2320 else
2321 wd = "/";
2322
2323 if (params->flags & EXEC_APPLY_CHROOT) {
2324 if (!needs_mount_ns && context->root_directory)
2325 if (chroot(context->root_directory) < 0) {
2326 *exit_status = EXIT_CHROOT;
2327 return -errno;
2328 }
2329
2330 d = wd;
2331 } else
2332 d = prefix_roota(context->root_directory, wd);
2333
2334 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2335 *exit_status = EXIT_CHDIR;
2336 return -errno;
2337 }
2338
2339 return 0;
2340 }
2341
2342 static int setup_keyring(
2343 Unit *u,
2344 const ExecContext *context,
2345 const ExecParameters *p,
2346 uid_t uid, gid_t gid) {
2347
2348 key_serial_t keyring;
2349 int r;
2350
2351 assert(u);
2352 assert(context);
2353 assert(p);
2354
2355 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2356 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2357 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2358 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2359 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2360 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2361
2362 if (!(p->flags & EXEC_NEW_KEYRING))
2363 return 0;
2364
2365 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2366 return 0;
2367
2368 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2369 if (keyring == -1) {
2370 if (errno == ENOSYS)
2371 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2372 else if (IN_SET(errno, EACCES, EPERM))
2373 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2374 else if (errno == EDQUOT)
2375 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2376 else
2377 return log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2378
2379 return 0;
2380 }
2381
2382 /* Populate they keyring with the invocation ID by default. */
2383 if (!sd_id128_is_null(u->invocation_id)) {
2384 key_serial_t key;
2385
2386 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2387 if (key == -1)
2388 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2389 else {
2390 if (keyctl(KEYCTL_SETPERM, key,
2391 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2392 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2393 return log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2394 }
2395 }
2396
2397 /* And now, make the keyring owned by the service's user */
2398 if (uid_is_valid(uid) || gid_is_valid(gid))
2399 if (keyctl(KEYCTL_CHOWN, keyring, uid, gid, 0) < 0)
2400 return log_unit_error_errno(u, errno, "Failed to change ownership of session keyring: %m");
2401
2402 /* When requested link the user keyring into the session keyring. */
2403 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2404 uid_t saved_uid;
2405 gid_t saved_gid;
2406
2407 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
2408 * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
2409 * sucks for parallel execution. This mimics what pam_keyinit does, too.*/
2410
2411 saved_uid = getuid();
2412 saved_gid = getgid();
2413
2414 if (gid_is_valid(gid) && gid != saved_gid) {
2415 if (setregid(gid, -1) < 0)
2416 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2417 }
2418
2419 if (uid_is_valid(uid) && uid != saved_uid) {
2420 if (setreuid(uid, -1) < 0) {
2421 (void) setregid(saved_gid, -1);
2422 return log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2423 }
2424 }
2425
2426 if (keyctl(KEYCTL_LINK,
2427 KEY_SPEC_USER_KEYRING,
2428 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2429
2430 r = -errno;
2431
2432 (void) setreuid(saved_uid, -1);
2433 (void) setregid(saved_gid, -1);
2434
2435 return log_unit_error_errno(u, r, "Failed to link user keyring into session keyring: %m");
2436 }
2437
2438 if (uid_is_valid(uid) && uid != saved_uid) {
2439 if (setreuid(saved_uid, -1) < 0) {
2440 (void) setregid(saved_gid, -1);
2441 return log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2442 }
2443 }
2444
2445 if (gid_is_valid(gid) && gid != saved_gid) {
2446 if (setregid(saved_gid, -1) < 0)
2447 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2448 }
2449 }
2450
2451 return 0;
2452 }
2453
/* Appends each valid (i.e. non-negative) fd of the socket pair to array[], advancing the element counter *n
 * for every fd stored. A NULL pair is silently ignored. The caller must make sure array[] has room for up to
 * two more entries. */
static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
        int k;

        assert(array);
        assert(n);

        if (!pair)
                return;

        for (k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2466
2467 static int close_remaining_fds(
2468 const ExecParameters *params,
2469 ExecRuntime *runtime,
2470 DynamicCreds *dcreds,
2471 int user_lookup_fd,
2472 int socket_fd,
2473 int *fds, unsigned n_fds) {
2474
2475 unsigned n_dont_close = 0;
2476 int dont_close[n_fds + 12];
2477
2478 assert(params);
2479
2480 if (params->stdin_fd >= 0)
2481 dont_close[n_dont_close++] = params->stdin_fd;
2482 if (params->stdout_fd >= 0)
2483 dont_close[n_dont_close++] = params->stdout_fd;
2484 if (params->stderr_fd >= 0)
2485 dont_close[n_dont_close++] = params->stderr_fd;
2486
2487 if (socket_fd >= 0)
2488 dont_close[n_dont_close++] = socket_fd;
2489 if (n_fds > 0) {
2490 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2491 n_dont_close += n_fds;
2492 }
2493
2494 if (runtime)
2495 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2496
2497 if (dcreds) {
2498 if (dcreds->user)
2499 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2500 if (dcreds->group)
2501 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2502 }
2503
2504 if (user_lookup_fd >= 0)
2505 dont_close[n_dont_close++] = user_lookup_fd;
2506
2507 return close_all_fds(dont_close, n_dont_close);
2508 }
2509
2510 static int send_user_lookup(
2511 Unit *unit,
2512 int user_lookup_fd,
2513 uid_t uid,
2514 gid_t gid) {
2515
2516 assert(unit);
2517
2518 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2519 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2520 * specified. */
2521
2522 if (user_lookup_fd < 0)
2523 return 0;
2524
2525 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2526 return 0;
2527
2528 if (writev(user_lookup_fd,
2529 (struct iovec[]) {
2530 IOVEC_INIT(&uid, sizeof(uid)),
2531 IOVEC_INIT(&gid, sizeof(gid)),
2532 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2533 return -errno;
2534
2535 return 0;
2536 }
2537
2538 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2539 int r;
2540
2541 assert(c);
2542 assert(home);
2543 assert(buf);
2544
2545 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2546
2547 if (*home)
2548 return 0;
2549
2550 if (!c->working_directory_home)
2551 return 0;
2552
2553 if (uid == 0) {
2554 /* Hardcode /root as home directory for UID 0 */
2555 *home = "/root";
2556 return 1;
2557 }
2558
2559 r = get_home_dir(buf);
2560 if (r < 0)
2561 return r;
2562
2563 *home = *buf;
2564 return 1;
2565 }
2566
2567 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2568 _cleanup_strv_free_ char ** list = NULL;
2569 ExecDirectoryType t;
2570 int r;
2571
2572 assert(c);
2573 assert(p);
2574 assert(ret);
2575
2576 assert(c->dynamic_user);
2577
2578 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2579 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2580 * directories. */
2581
2582 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2583 char **i;
2584
2585 if (t == EXEC_DIRECTORY_CONFIGURATION)
2586 continue;
2587
2588 if (!p->prefix[t])
2589 continue;
2590
2591 STRV_FOREACH(i, c->directories[t].paths) {
2592 char *e;
2593
2594 if (t == EXEC_DIRECTORY_RUNTIME)
2595 e = strjoin(p->prefix[t], "/", *i);
2596 else
2597 e = strjoin(p->prefix[t], "/private/", *i);
2598 if (!e)
2599 return -ENOMEM;
2600
2601 r = strv_consume(&list, e);
2602 if (r < 0)
2603 return r;
2604 }
2605 }
2606
2607 *ret = list;
2608 list = NULL;
2609
2610 return 0;
2611 }
2612
2613 static int exec_child(
2614 Unit *unit,
2615 ExecCommand *command,
2616 const ExecContext *context,
2617 const ExecParameters *params,
2618 ExecRuntime *runtime,
2619 DynamicCreds *dcreds,
2620 char **argv,
2621 int socket_fd,
2622 int named_iofds[3],
2623 int *fds,
2624 unsigned n_storage_fds,
2625 unsigned n_socket_fds,
2626 char **files_env,
2627 int user_lookup_fd,
2628 int *exit_status) {
2629
2630 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2631 _cleanup_free_ char *mac_selinux_context_net = NULL, *home_buffer = NULL;
2632 _cleanup_free_ gid_t *supplementary_gids = NULL;
2633 const char *username = NULL, *groupname = NULL;
2634 const char *home = NULL, *shell = NULL;
2635 dev_t journal_stream_dev = 0;
2636 ino_t journal_stream_ino = 0;
2637 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2638 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
2639 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
2640 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
2641 #if HAVE_SELINUX
2642 bool use_selinux = false;
2643 #endif
2644 #if ENABLE_SMACK
2645 bool use_smack = false;
2646 #endif
2647 #if HAVE_APPARMOR
2648 bool use_apparmor = false;
2649 #endif
2650 uid_t uid = UID_INVALID;
2651 gid_t gid = GID_INVALID;
2652 int i, r, ngids = 0;
2653 unsigned n_fds;
2654 ExecDirectoryType dt;
2655 int secure_bits;
2656
2657 assert(unit);
2658 assert(command);
2659 assert(context);
2660 assert(params);
2661 assert(exit_status);
2662
2663 rename_process_from_path(command->path);
2664
2665 /* We reset exactly these signals, since they are the
2666 * only ones we set to SIG_IGN in the main daemon. All
2667 * others we leave untouched because we set them to
2668 * SIG_DFL or a valid handler initially, both of which
2669 * will be demoted to SIG_DFL. */
2670 (void) default_signals(SIGNALS_CRASH_HANDLER,
2671 SIGNALS_IGNORE, -1);
2672
2673 if (context->ignore_sigpipe)
2674 (void) ignore_signals(SIGPIPE, -1);
2675
2676 r = reset_signal_mask();
2677 if (r < 0) {
2678 *exit_status = EXIT_SIGNAL_MASK;
2679 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2680 }
2681
2682 if (params->idle_pipe)
2683 do_idle_pipe_dance(params->idle_pipe);
2684
2685 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2686 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2687 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2688 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2689
2690 log_forget_fds();
2691 log_set_open_when_needed(true);
2692
2693 /* In case anything used libc syslog(), close this here, too */
2694 closelog();
2695
2696 n_fds = n_storage_fds + n_socket_fds;
2697 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
2698 if (r < 0) {
2699 *exit_status = EXIT_FDS;
2700 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2701 }
2702
2703 if (!context->same_pgrp)
2704 if (setsid() < 0) {
2705 *exit_status = EXIT_SETSID;
2706 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2707 }
2708
2709 exec_context_tty_reset(context, params);
2710
2711 if (unit_shall_confirm_spawn(unit)) {
2712 const char *vc = params->confirm_spawn;
2713 _cleanup_free_ char *cmdline = NULL;
2714
2715 cmdline = exec_command_line(argv);
2716 if (!cmdline) {
2717 *exit_status = EXIT_MEMORY;
2718 return log_oom();
2719 }
2720
2721 r = ask_for_confirmation(vc, unit, cmdline);
2722 if (r != CONFIRM_EXECUTE) {
2723 if (r == CONFIRM_PRETEND_SUCCESS) {
2724 *exit_status = EXIT_SUCCESS;
2725 return 0;
2726 }
2727 *exit_status = EXIT_CONFIRM;
2728 log_unit_error(unit, "Execution cancelled by the user");
2729 return -ECANCELED;
2730 }
2731 }
2732
2733 if (context->dynamic_user && dcreds) {
2734 _cleanup_strv_free_ char **suggested_paths = NULL;
2735
2736 /* Make sure we bypass our own NSS module for any NSS checks */
2737 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2738 *exit_status = EXIT_USER;
2739 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2740 }
2741
2742 r = compile_suggested_paths(context, params, &suggested_paths);
2743 if (r < 0) {
2744 *exit_status = EXIT_MEMORY;
2745 return log_oom();
2746 }
2747
2748 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2749 if (r < 0) {
2750 *exit_status = EXIT_USER;
2751 if (r == -EILSEQ) {
2752 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2753 return -EOPNOTSUPP;
2754 }
2755 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2756 }
2757
2758 if (!uid_is_valid(uid)) {
2759 *exit_status = EXIT_USER;
2760 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2761 return -ESRCH;
2762 }
2763
2764 if (!gid_is_valid(gid)) {
2765 *exit_status = EXIT_USER;
2766 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2767 return -ESRCH;
2768 }
2769
2770 if (dcreds->user)
2771 username = dcreds->user->name;
2772
2773 } else {
2774 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2775 if (r < 0) {
2776 *exit_status = EXIT_USER;
2777 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2778 }
2779
2780 r = get_fixed_group(context, &groupname, &gid);
2781 if (r < 0) {
2782 *exit_status = EXIT_GROUP;
2783 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2784 }
2785 }
2786
2787 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2788 r = get_supplementary_groups(context, username, groupname, gid,
2789 &supplementary_gids, &ngids);
2790 if (r < 0) {
2791 *exit_status = EXIT_GROUP;
2792 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2793 }
2794
2795 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2796 if (r < 0) {
2797 *exit_status = EXIT_USER;
2798 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2799 }
2800
2801 user_lookup_fd = safe_close(user_lookup_fd);
2802
2803 r = acquire_home(context, uid, &home, &home_buffer);
2804 if (r < 0) {
2805 *exit_status = EXIT_CHDIR;
2806 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2807 }
2808
2809 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2810 * must sure to drop O_NONBLOCK */
2811 if (socket_fd >= 0)
2812 (void) fd_nonblock(socket_fd, false);
2813
2814 r = setup_input(context, params, socket_fd, named_iofds);
2815 if (r < 0) {
2816 *exit_status = EXIT_STDIN;
2817 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2818 }
2819
2820 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2821 if (r < 0) {
2822 *exit_status = EXIT_STDOUT;
2823 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2824 }
2825
2826 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2827 if (r < 0) {
2828 *exit_status = EXIT_STDERR;
2829 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2830 }
2831
2832 if (params->cgroup_path) {
2833 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2834 if (r < 0) {
2835 *exit_status = EXIT_CGROUP;
2836 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2837 }
2838 }
2839
2840 if (context->oom_score_adjust_set) {
2841 char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
2842
2843 /* When we can't make this change due to EPERM, then
2844 * let's silently skip over it. User namespaces
2845 * prohibit write access to this file, and we
2846 * shouldn't trip up over that. */
2847
2848 sprintf(t, "%i", context->oom_score_adjust);
2849 r = write_string_file("/proc/self/oom_score_adj", t, 0);
2850 if (IN_SET(r, -EPERM, -EACCES))
2851 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2852 else if (r < 0) {
2853 *exit_status = EXIT_OOM_ADJUST;
2854 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
2855 }
2856 }
2857
2858 if (context->nice_set)
2859 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2860 *exit_status = EXIT_NICE;
2861 return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
2862 }
2863
2864 if (context->cpu_sched_set) {
2865 struct sched_param param = {
2866 .sched_priority = context->cpu_sched_priority,
2867 };
2868
2869 r = sched_setscheduler(0,
2870 context->cpu_sched_policy |
2871 (context->cpu_sched_reset_on_fork ?
2872 SCHED_RESET_ON_FORK : 0),
2873 &param);
2874 if (r < 0) {
2875 *exit_status = EXIT_SETSCHEDULER;
2876 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
2877 }
2878 }
2879
2880 if (context->cpuset)
2881 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2882 *exit_status = EXIT_CPUAFFINITY;
2883 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
2884 }
2885
2886 if (context->ioprio_set)
2887 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
2888 *exit_status = EXIT_IOPRIO;
2889 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
2890 }
2891
2892 if (context->timer_slack_nsec != NSEC_INFINITY)
2893 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
2894 *exit_status = EXIT_TIMERSLACK;
2895 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
2896 }
2897
2898 if (context->personality != PERSONALITY_INVALID) {
2899 r = safe_personality(context->personality);
2900 if (r < 0) {
2901 *exit_status = EXIT_PERSONALITY;
2902 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
2903 }
2904 }
2905
2906 if (context->utmp_id)
2907 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
2908 context->tty_path,
2909 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
2910 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
2911 USER_PROCESS,
2912 username);
2913
2914 if (context->user) {
2915 r = chown_terminal(STDIN_FILENO, uid);
2916 if (r < 0) {
2917 *exit_status = EXIT_STDIN;
2918 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
2919 }
2920 }
2921
2922 /* If delegation is enabled we'll pass ownership of the cgroup
2923 * (but only in systemd's own controller hierarchy!) to the
2924 * user of the new process. */
2925 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
2926 r = cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0644, uid, gid);
2927 if (r < 0) {
2928 *exit_status = EXIT_CGROUP;
2929 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
2930 }
2931
2932 r = cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0755, uid, gid);
2933 if (r < 0) {
2934 *exit_status = EXIT_CGROUP;
2935 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
2936 }
2937 }
2938
2939 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2940 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
2941 if (r < 0)
2942 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
2943 }
2944
2945 r = build_environment(
2946 unit,
2947 context,
2948 params,
2949 n_fds,
2950 home,
2951 username,
2952 shell,
2953 journal_stream_dev,
2954 journal_stream_ino,
2955 &our_env);
2956 if (r < 0) {
2957 *exit_status = EXIT_MEMORY;
2958 return log_oom();
2959 }
2960
2961 r = build_pass_environment(context, &pass_env);
2962 if (r < 0) {
2963 *exit_status = EXIT_MEMORY;
2964 return log_oom();
2965 }
2966
2967 accum_env = strv_env_merge(5,
2968 params->environment,
2969 our_env,
2970 pass_env,
2971 context->environment,
2972 files_env,
2973 NULL);
2974 if (!accum_env) {
2975 *exit_status = EXIT_MEMORY;
2976 return log_oom();
2977 }
2978 accum_env = strv_env_clean(accum_env);
2979
2980 (void) umask(context->umask);
2981
2982 r = setup_keyring(unit, context, params, uid, gid);
2983 if (r < 0) {
2984 *exit_status = EXIT_KEYRING;
2985 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
2986 }
2987
2988 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
2989 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2990
2991 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
2992 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
2993
2994 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
2995 if (needs_ambient_hack)
2996 needs_setuid = false;
2997 else
2998 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
2999
3000 if (needs_sandboxing) {
3001 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3002 * present. The actual MAC context application will happen later, as late as possible, to avoid
3003 * impacting our own code paths. */
3004
3005 #if HAVE_SELINUX
3006 use_selinux = mac_selinux_use();
3007 #endif
3008 #if ENABLE_SMACK
3009 use_smack = mac_smack_use();
3010 #endif
3011 #if HAVE_APPARMOR
3012 use_apparmor = mac_apparmor_use();
3013 #endif
3014 }
3015
3016 if (needs_setuid) {
3017 if (context->pam_name && username) {
3018 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3019 if (r < 0) {
3020 *exit_status = EXIT_PAM;
3021 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3022 }
3023 }
3024 }
3025
3026 if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3027 if (ns_type_supported(NAMESPACE_NET)) {
3028 r = setup_netns(runtime->netns_storage_socket);
3029 if (r < 0) {
3030 *exit_status = EXIT_NETWORK;
3031 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3032 }
3033 } else
3034 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3035 }
3036
3037 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3038 if (needs_mount_namespace) {
3039 r = apply_mount_namespace(unit, command, context, params, runtime);
3040 if (r < 0) {
3041 *exit_status = EXIT_NAMESPACE;
3042 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3043 }
3044 }
3045
3046 /* Apply just after mount namespace setup */
3047 r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3048 if (r < 0)
3049 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3050
3051 /* Drop groups as early as possbile */
3052 if (needs_setuid) {
3053 r = enforce_groups(gid, supplementary_gids, ngids);
3054 if (r < 0) {
3055 *exit_status = EXIT_GROUP;
3056 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3057 }
3058 }
3059
3060 if (needs_sandboxing) {
3061 #if HAVE_SELINUX
3062 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3063 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3064 if (r < 0) {
3065 *exit_status = EXIT_SELINUX_CONTEXT;
3066 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3067 }
3068 }
3069 #endif
3070
3071 if (context->private_users) {
3072 r = setup_private_users(uid, gid);
3073 if (r < 0) {
3074 *exit_status = EXIT_USER;
3075 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3076 }
3077 }
3078 }
3079
3080 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3081 * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3082 * was needed to upload the policy and can now be closed as well. */
3083 r = close_all_fds(fds, n_fds);
3084 if (r >= 0)
3085 r = shift_fds(fds, n_fds);
3086 if (r >= 0)
3087 r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
3088 if (r < 0) {
3089 *exit_status = EXIT_FDS;
3090 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3091 }
3092
3093 secure_bits = context->secure_bits;
3094
3095 if (needs_sandboxing) {
3096 uint64_t bset;
3097
3098 for (i = 0; i < _RLIMIT_MAX; i++) {
3099
3100 if (!context->rlimit[i])
3101 continue;
3102
3103 r = setrlimit_closest(i, context->rlimit[i]);
3104 if (r < 0) {
3105 *exit_status = EXIT_LIMITS;
3106 return log_unit_error_errno(unit, r, "Failed to adjust resource limit %s: %m", rlimit_to_string(i));
3107 }
3108 }
3109
3110 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3111 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3112 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3113 *exit_status = EXIT_LIMITS;
3114 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3115 }
3116 }
3117
3118 bset = context->capability_bounding_set;
3119 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3120 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3121 * instead of us doing that */
3122 if (needs_ambient_hack)
3123 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3124 (UINT64_C(1) << CAP_SETUID) |
3125 (UINT64_C(1) << CAP_SETGID);
3126
3127 if (!cap_test_all(bset)) {
3128 r = capability_bounding_set_drop(bset, false);
3129 if (r < 0) {
3130 *exit_status = EXIT_CAPABILITIES;
3131 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3132 }
3133 }
3134
3135 /* This is done before enforce_user, but ambient set
3136 * does not survive over setresuid() if keep_caps is not set. */
3137 if (!needs_ambient_hack &&
3138 context->capability_ambient_set != 0) {
3139 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3140 if (r < 0) {
3141 *exit_status = EXIT_CAPABILITIES;
3142 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3143 }
3144 }
3145 }
3146
3147 if (needs_setuid) {
3148 if (context->user) {
3149 r = enforce_user(context, uid);
3150 if (r < 0) {
3151 *exit_status = EXIT_USER;
3152 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3153 }
3154
3155 if (!needs_ambient_hack &&
3156 context->capability_ambient_set != 0) {
3157
3158 /* Fix the ambient capabilities after user change. */
3159 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3160 if (r < 0) {
3161 *exit_status = EXIT_CAPABILITIES;
3162 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3163 }
3164
3165 /* If we were asked to change user and ambient capabilities
3166 * were requested, we had to add keep-caps to the securebits
3167 * so that we would maintain the inherited capability set
3168 * through the setresuid(). Make sure that the bit is added
3169 * also to the context secure_bits so that we don't try to
3170 * drop the bit away next. */
3171
3172 secure_bits |= 1<<SECURE_KEEP_CAPS;
3173 }
3174 }
3175 }
3176
3177 if (needs_sandboxing) {
3178 /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3179 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3180 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3181 * are restricted. */
3182
3183 #if HAVE_SELINUX
3184 if (use_selinux) {
3185 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3186
3187 if (exec_context) {
3188 r = setexeccon(exec_context);
3189 if (r < 0) {
3190 *exit_status = EXIT_SELINUX_CONTEXT;
3191 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3192 }
3193 }
3194 }
3195 #endif
3196
3197 #if ENABLE_SMACK
3198 if (use_smack) {
3199 r = setup_smack(context, command);
3200 if (r < 0) {
3201 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3202 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3203 }
3204 }
3205 #endif
3206
3207 #if HAVE_APPARMOR
3208 if (use_apparmor && context->apparmor_profile) {
3209 r = aa_change_onexec(context->apparmor_profile);
3210 if (r < 0 && !context->apparmor_profile_ignore) {
3211 *exit_status = EXIT_APPARMOR_PROFILE;
3212 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3213 }
3214 }
3215 #endif
3216
3217 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3218 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3219 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3220 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3221 *exit_status = EXIT_SECUREBITS;
3222 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3223 }
3224
3225 if (context_has_no_new_privileges(context))
3226 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3227 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3228 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3229 }
3230
3231 #if HAVE_SECCOMP
3232 r = apply_address_families(unit, context);
3233 if (r < 0) {
3234 *exit_status = EXIT_ADDRESS_FAMILIES;
3235 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3236 }
3237
3238 r = apply_memory_deny_write_execute(unit, context);
3239 if (r < 0) {
3240 *exit_status = EXIT_SECCOMP;
3241 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3242 }
3243
3244 r = apply_restrict_realtime(unit, context);
3245 if (r < 0) {
3246 *exit_status = EXIT_SECCOMP;
3247 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3248 }
3249
3250 r = apply_restrict_namespaces(unit, context);
3251 if (r < 0) {
3252 *exit_status = EXIT_SECCOMP;
3253 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3254 }
3255
3256 r = apply_protect_sysctl(unit, context);
3257 if (r < 0) {
3258 *exit_status = EXIT_SECCOMP;
3259 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3260 }
3261
3262 r = apply_protect_kernel_modules(unit, context);
3263 if (r < 0) {
3264 *exit_status = EXIT_SECCOMP;
3265 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3266 }
3267
3268 r = apply_private_devices(unit, context);
3269 if (r < 0) {
3270 *exit_status = EXIT_SECCOMP;
3271 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3272 }
3273
3274 r = apply_syscall_archs(unit, context);
3275 if (r < 0) {
3276 *exit_status = EXIT_SECCOMP;
3277 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3278 }
3279
3280 r = apply_lock_personality(unit, context);
3281 if (r < 0) {
3282 *exit_status = EXIT_SECCOMP;
3283 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3284 }
3285
3286 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3287 * by the filter as little as possible. */
3288 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3289 if (r < 0) {
3290 *exit_status = EXIT_SECCOMP;
3291 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3292 }
3293 #endif
3294 }
3295
3296 if (!strv_isempty(context->unset_environment)) {
3297 char **ee = NULL;
3298
3299 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3300 if (!ee) {
3301 *exit_status = EXIT_MEMORY;
3302 return log_oom();
3303 }
3304
3305 strv_free(accum_env);
3306 accum_env = ee;
3307 }
3308
3309 final_argv = replace_env_argv(argv, accum_env);
3310 if (!final_argv) {
3311 *exit_status = EXIT_MEMORY;
3312 return log_oom();
3313 }
3314
3315 if (_unlikely_(log_get_max_level() >= LOG_DEBUG)) {
3316 _cleanup_free_ char *line;
3317
3318 line = exec_command_line(final_argv);
3319 if (line) {
3320 log_struct(LOG_DEBUG,
3321 "EXECUTABLE=%s", command->path,
3322 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3323 LOG_UNIT_ID(unit),
3324 LOG_UNIT_INVOCATION_ID(unit),
3325 NULL);
3326 }
3327 }
3328
3329 execve(command->path, final_argv, accum_env);
3330
3331 if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3332
3333 log_struct_errno(LOG_INFO, errno,
3334 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3335 LOG_UNIT_ID(unit),
3336 LOG_UNIT_INVOCATION_ID(unit),
3337 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3338 command->path),
3339 "EXECUTABLE=%s", command->path,
3340 NULL);
3341
3342 return 0;
3343 }
3344
3345 *exit_status = EXIT_EXEC;
3346 return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
3347 }
3348
3349 int exec_spawn(Unit *unit,
3350 ExecCommand *command,
3351 const ExecContext *context,
3352 const ExecParameters *params,
3353 ExecRuntime *runtime,
3354 DynamicCreds *dcreds,
3355 pid_t *ret) {
3356
3357 _cleanup_strv_free_ char **files_env = NULL;
3358 int *fds = NULL;
3359 unsigned n_storage_fds = 0, n_socket_fds = 0;
3360 _cleanup_free_ char *line = NULL;
3361 int socket_fd, r;
3362 int named_iofds[3] = { -1, -1, -1 };
3363 char **argv;
3364 pid_t pid;
3365
3366 assert(unit);
3367 assert(command);
3368 assert(context);
3369 assert(ret);
3370 assert(params);
3371 assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
3372
3373 if (context->std_input == EXEC_INPUT_SOCKET ||
3374 context->std_output == EXEC_OUTPUT_SOCKET ||
3375 context->std_error == EXEC_OUTPUT_SOCKET) {
3376
3377 if (params->n_socket_fds > 1) {
3378 log_unit_error(unit, "Got more than one socket.");
3379 return -EINVAL;
3380 }
3381
3382 if (params->n_socket_fds == 0) {
3383 log_unit_error(unit, "Got no socket.");
3384 return -EINVAL;
3385 }
3386
3387 socket_fd = params->fds[0];
3388 } else {
3389 socket_fd = -1;
3390 fds = params->fds;
3391 n_storage_fds = params->n_storage_fds;
3392 n_socket_fds = params->n_socket_fds;
3393 }
3394
3395 r = exec_context_named_iofds(unit, context, params, named_iofds);
3396 if (r < 0)
3397 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3398
3399 r = exec_context_load_environment(unit, context, &files_env);
3400 if (r < 0)
3401 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3402
3403 argv = params->argv ?: command->argv;
3404 line = exec_command_line(argv);
3405 if (!line)
3406 return log_oom();
3407
3408 log_struct(LOG_DEBUG,
3409 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3410 "EXECUTABLE=%s", command->path,
3411 LOG_UNIT_ID(unit),
3412 LOG_UNIT_INVOCATION_ID(unit),
3413 NULL);
3414
3415 pid = fork();
3416 if (pid < 0)
3417 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3418
3419 if (pid == 0) {
3420 int exit_status = EXIT_SUCCESS;
3421
3422 r = exec_child(unit,
3423 command,
3424 context,
3425 params,
3426 runtime,
3427 dcreds,
3428 argv,
3429 socket_fd,
3430 named_iofds,
3431 fds,
3432 n_storage_fds,
3433 n_socket_fds,
3434 files_env,
3435 unit->manager->user_lookup_fds[1],
3436 &exit_status);
3437
3438 if (r < 0) {
3439 log_struct_errno(LOG_ERR, r,
3440 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3441 LOG_UNIT_ID(unit),
3442 LOG_UNIT_INVOCATION_ID(unit),
3443 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3444 exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3445 command->path),
3446 "EXECUTABLE=%s", command->path,
3447 NULL);
3448 }
3449
3450 _exit(exit_status);
3451 }
3452
3453 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3454
3455 /* We add the new process to the cgroup both in the child (so
3456 * that we can be sure that no user code is ever executed
3457 * outside of the cgroup) and in the parent (so that we can be
3458 * sure that when we kill the cgroup the process will be
3459 * killed too). */
3460 if (params->cgroup_path)
3461 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3462
3463 exec_status_start(&command->exec_status, pid);
3464
3465 *ret = pid;
3466 return 0;
3467 }
3468
3469 void exec_context_init(ExecContext *c) {
3470 ExecDirectoryType i;
3471
3472 assert(c);
3473
3474 c->umask = 0022;
3475 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3476 c->cpu_sched_policy = SCHED_OTHER;
3477 c->syslog_priority = LOG_DAEMON|LOG_INFO;
3478 c->syslog_level_prefix = true;
3479 c->ignore_sigpipe = true;
3480 c->timer_slack_nsec = NSEC_INFINITY;
3481 c->personality = PERSONALITY_INVALID;
3482 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3483 c->directories[i].mode = 0755;
3484 c->capability_bounding_set = CAP_ALL;
3485 c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
3486 c->log_level_max = -1;
3487 }
3488
3489 void exec_context_done(ExecContext *c) {
3490 ExecDirectoryType i;
3491 size_t l;
3492
3493 assert(c);
3494
3495 c->environment = strv_free(c->environment);
3496 c->environment_files = strv_free(c->environment_files);
3497 c->pass_environment = strv_free(c->pass_environment);
3498 c->unset_environment = strv_free(c->unset_environment);
3499
3500 for (l = 0; l < ELEMENTSOF(c->rlimit); l++)
3501 c->rlimit[l] = mfree(c->rlimit[l]);
3502
3503 for (l = 0; l < 3; l++)
3504 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3505
3506 c->working_directory = mfree(c->working_directory);
3507 c->root_directory = mfree(c->root_directory);
3508 c->root_image = mfree(c->root_image);
3509 c->tty_path = mfree(c->tty_path);
3510 c->syslog_identifier = mfree(c->syslog_identifier);
3511 c->user = mfree(c->user);
3512 c->group = mfree(c->group);
3513
3514 c->supplementary_groups = strv_free(c->supplementary_groups);
3515
3516 c->pam_name = mfree(c->pam_name);
3517
3518 c->read_only_paths = strv_free(c->read_only_paths);
3519 c->read_write_paths = strv_free(c->read_write_paths);
3520 c->inaccessible_paths = strv_free(c->inaccessible_paths);
3521
3522 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3523
3524 if (c->cpuset)
3525 CPU_FREE(c->cpuset);
3526
3527 c->utmp_id = mfree(c->utmp_id);
3528 c->selinux_context = mfree(c->selinux_context);
3529 c->apparmor_profile = mfree(c->apparmor_profile);
3530 c->smack_process_label = mfree(c->smack_process_label);
3531
3532 c->syscall_filter = hashmap_free(c->syscall_filter);
3533 c->syscall_archs = set_free(c->syscall_archs);
3534 c->address_families = set_free(c->address_families);
3535
3536 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3537 c->directories[i].paths = strv_free(c->directories[i].paths);
3538
3539 c->log_level_max = -1;
3540
3541 exec_context_free_log_extra_fields(c);
3542 }
3543
3544 int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_prefix) {
3545 char **i;
3546
3547 assert(c);
3548
3549 if (!runtime_prefix)
3550 return 0;
3551
3552 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3553 _cleanup_free_ char *p;
3554
3555 p = strjoin(runtime_prefix, "/", *i);
3556 if (!p)
3557 return -ENOMEM;
3558
3559 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3560 * next. */
3561 (void) rm_rf(p, REMOVE_ROOT);
3562 }
3563
3564 return 0;
3565 }
3566
3567 void exec_command_done(ExecCommand *c) {
3568 assert(c);
3569
3570 c->path = mfree(c->path);
3571
3572 c->argv = strv_free(c->argv);
3573 }
3574
3575 void exec_command_done_array(ExecCommand *c, unsigned n) {
3576 unsigned i;
3577
3578 for (i = 0; i < n; i++)
3579 exec_command_done(c+i);
3580 }
3581
3582 ExecCommand* exec_command_free_list(ExecCommand *c) {
3583 ExecCommand *i;
3584
3585 while ((i = c)) {
3586 LIST_REMOVE(command, c, i);
3587 exec_command_done(i);
3588 free(i);
3589 }
3590
3591 return NULL;
3592 }
3593
3594 void exec_command_free_array(ExecCommand **c, unsigned n) {
3595 unsigned i;
3596
3597 for (i = 0; i < n; i++)
3598 c[i] = exec_command_free_list(c[i]);
3599 }
3600
3601 typedef struct InvalidEnvInfo {
3602 Unit *unit;
3603 const char *path;
3604 } InvalidEnvInfo;
3605
3606 static void invalid_env(const char *p, void *userdata) {
3607 InvalidEnvInfo *info = userdata;
3608
3609 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3610 }
3611
3612 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3613 assert(c);
3614
3615 switch (fd_index) {
3616 case STDIN_FILENO:
3617 if (c->std_input != EXEC_INPUT_NAMED_FD)
3618 return NULL;
3619 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3620 case STDOUT_FILENO:
3621 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3622 return NULL;
3623 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3624 case STDERR_FILENO:
3625 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3626 return NULL;
3627 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3628 default:
3629 return NULL;
3630 }
3631 }
3632
3633 int exec_context_named_iofds(Unit *unit, const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3634 unsigned i, targets;
3635 const char* stdio_fdname[3];
3636 unsigned n_fds;
3637
3638 assert(c);
3639 assert(p);
3640
3641 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3642 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3643 (c->std_error == EXEC_OUTPUT_NAMED_FD);
3644
3645 for (i = 0; i < 3; i++)
3646 stdio_fdname[i] = exec_context_fdname(c, i);
3647
3648 n_fds = p->n_storage_fds + p->n_socket_fds;
3649
3650 for (i = 0; i < n_fds && targets > 0; i++)
3651 if (named_iofds[STDIN_FILENO] < 0 &&
3652 c->std_input == EXEC_INPUT_NAMED_FD &&
3653 stdio_fdname[STDIN_FILENO] &&
3654 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3655
3656 named_iofds[STDIN_FILENO] = p->fds[i];
3657 targets--;
3658
3659 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3660 c->std_output == EXEC_OUTPUT_NAMED_FD &&
3661 stdio_fdname[STDOUT_FILENO] &&
3662 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3663
3664 named_iofds[STDOUT_FILENO] = p->fds[i];
3665 targets--;
3666
3667 } else if (named_iofds[STDERR_FILENO] < 0 &&
3668 c->std_error == EXEC_OUTPUT_NAMED_FD &&
3669 stdio_fdname[STDERR_FILENO] &&
3670 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3671
3672 named_iofds[STDERR_FILENO] = p->fds[i];
3673 targets--;
3674 }
3675
3676 return targets == 0 ? 0 : -ENOENT;
3677 }
3678
3679 int exec_context_load_environment(Unit *unit, const ExecContext *c, char ***l) {
3680 char **i, **r = NULL;
3681
3682 assert(c);
3683 assert(l);
3684
3685 STRV_FOREACH(i, c->environment_files) {
3686 char *fn;
3687 int k;
3688 unsigned n;
3689 bool ignore = false;
3690 char **p;
3691 _cleanup_globfree_ glob_t pglob = {};
3692
3693 fn = *i;
3694
3695 if (fn[0] == '-') {
3696 ignore = true;
3697 fn++;
3698 }
3699
3700 if (!path_is_absolute(fn)) {
3701 if (ignore)
3702 continue;
3703
3704 strv_free(r);
3705 return -EINVAL;
3706 }
3707
3708 /* Filename supports globbing, take all matching files */
3709 k = safe_glob(fn, 0, &pglob);
3710 if (k < 0) {
3711 if (ignore)
3712 continue;
3713
3714 strv_free(r);
3715 return k;
3716 }
3717
3718 /* When we don't match anything, -ENOENT should be returned */
3719 assert(pglob.gl_pathc > 0);
3720
3721 for (n = 0; n < pglob.gl_pathc; n++) {
3722 k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3723 if (k < 0) {
3724 if (ignore)
3725 continue;
3726
3727 strv_free(r);
3728 return k;
3729 }
3730 /* Log invalid environment variables with filename */
3731 if (p) {
3732 InvalidEnvInfo info = {
3733 .unit = unit,
3734 .path = pglob.gl_pathv[n]
3735 };
3736
3737 p = strv_env_clean_with_callback(p, invalid_env, &info);
3738 }
3739
3740 if (r == NULL)
3741 r = p;
3742 else {
3743 char **m;
3744
3745 m = strv_env_merge(2, r, p);
3746 strv_free(r);
3747 strv_free(p);
3748 if (!m)
3749 return -ENOMEM;
3750
3751 r = m;
3752 }
3753 }
3754 }
3755
3756 *l = r;
3757
3758 return 0;
3759 }
3760
3761 static bool tty_may_match_dev_console(const char *tty) {
3762 _cleanup_free_ char *active = NULL;
3763 char *console;
3764
3765 if (!tty)
3766 return true;
3767
3768 tty = skip_dev_prefix(tty);
3769
3770 /* trivial identity? */
3771 if (streq(tty, "console"))
3772 return true;
3773
3774 console = resolve_dev_console(&active);
3775 /* if we could not resolve, assume it may */
3776 if (!console)
3777 return true;
3778
3779 /* "tty0" means the active VC, so it may be the same sometimes */
3780 return streq(console, tty) || (streq(console, "tty0") && tty_is_vc(tty));
3781 }
3782
3783 bool exec_context_may_touch_console(ExecContext *ec) {
3784
3785 return (ec->tty_reset ||
3786 ec->tty_vhangup ||
3787 ec->tty_vt_disallocate ||
3788 is_terminal_input(ec->std_input) ||
3789 is_terminal_output(ec->std_output) ||
3790 is_terminal_output(ec->std_error)) &&
3791 tty_may_match_dev_console(exec_context_tty_path(ec));
3792 }
3793
/* Writes every string of the vector l to f, each preceded by a single space. A NULL vector writes
 * nothing. */
static void strv_fprintf(FILE *f, char **l) {
        char **s;

        assert(f);

        for (s = l; s && *s; s++) {
                fputc(' ', f);
                fputs(*s, f);
        }
}
3802
3803 void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
3804 ExecDirectoryType dt;
3805 char **e, **d;
3806 unsigned i;
3807 int r;
3808
3809 assert(c);
3810 assert(f);
3811
3812 prefix = strempty(prefix);
3813
3814 fprintf(f,
3815 "%sUMask: %04o\n"
3816 "%sWorkingDirectory: %s\n"
3817 "%sRootDirectory: %s\n"
3818 "%sNonBlocking: %s\n"
3819 "%sPrivateTmp: %s\n"
3820 "%sPrivateDevices: %s\n"
3821 "%sProtectKernelTunables: %s\n"
3822 "%sProtectKernelModules: %s\n"
3823 "%sProtectControlGroups: %s\n"
3824 "%sPrivateNetwork: %s\n"
3825 "%sPrivateUsers: %s\n"
3826 "%sProtectHome: %s\n"
3827 "%sProtectSystem: %s\n"
3828 "%sMountAPIVFS: %s\n"
3829 "%sIgnoreSIGPIPE: %s\n"
3830 "%sMemoryDenyWriteExecute: %s\n"
3831 "%sRestrictRealtime: %s\n"
3832 "%sKeyringMode: %s\n",
3833 prefix, c->umask,
3834 prefix, c->working_directory ? c->working_directory : "/",
3835 prefix, c->root_directory ? c->root_directory : "/",
3836 prefix, yes_no(c->non_blocking),
3837 prefix, yes_no(c->private_tmp),
3838 prefix, yes_no(c->private_devices),
3839 prefix, yes_no(c->protect_kernel_tunables),
3840 prefix, yes_no(c->protect_kernel_modules),
3841 prefix, yes_no(c->protect_control_groups),
3842 prefix, yes_no(c->private_network),
3843 prefix, yes_no(c->private_users),
3844 prefix, protect_home_to_string(c->protect_home),
3845 prefix, protect_system_to_string(c->protect_system),
3846 prefix, yes_no(c->mount_apivfs),
3847 prefix, yes_no(c->ignore_sigpipe),
3848 prefix, yes_no(c->memory_deny_write_execute),
3849 prefix, yes_no(c->restrict_realtime),
3850 prefix, exec_keyring_mode_to_string(c->keyring_mode));
3851
3852 if (c->root_image)
3853 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
3854
3855 STRV_FOREACH(e, c->environment)
3856 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
3857
3858 STRV_FOREACH(e, c->environment_files)
3859 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
3860
3861 STRV_FOREACH(e, c->pass_environment)
3862 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
3863
3864 STRV_FOREACH(e, c->unset_environment)
3865 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
3866
3867 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
3868
3869 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3870 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
3871
3872 STRV_FOREACH(d, c->directories[dt].paths)
3873 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
3874 }
3875
3876 if (c->nice_set)
3877 fprintf(f,
3878 "%sNice: %i\n",
3879 prefix, c->nice);
3880
3881 if (c->oom_score_adjust_set)
3882 fprintf(f,
3883 "%sOOMScoreAdjust: %i\n",
3884 prefix, c->oom_score_adjust);
3885
3886 for (i = 0; i < RLIM_NLIMITS; i++)
3887 if (c->rlimit[i]) {
3888 fprintf(f, "%s%s: " RLIM_FMT "\n",
3889 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
3890 fprintf(f, "%s%sSoft: " RLIM_FMT "\n",
3891 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
3892 }
3893
3894 if (c->ioprio_set) {
3895 _cleanup_free_ char *class_str = NULL;
3896
3897 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
3898 if (r >= 0)
3899 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
3900
3901 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
3902 }
3903
3904 if (c->cpu_sched_set) {
3905 _cleanup_free_ char *policy_str = NULL;
3906
3907 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
3908 if (r >= 0)
3909 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
3910
3911 fprintf(f,
3912 "%sCPUSchedulingPriority: %i\n"
3913 "%sCPUSchedulingResetOnFork: %s\n",
3914 prefix, c->cpu_sched_priority,
3915 prefix, yes_no(c->cpu_sched_reset_on_fork));
3916 }
3917
3918 if (c->cpuset) {
3919 fprintf(f, "%sCPUAffinity:", prefix);
3920 for (i = 0; i < c->cpuset_ncpus; i++)
3921 if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
3922 fprintf(f, " %u", i);
3923 fputs("\n", f);
3924 }
3925
3926 if (c->timer_slack_nsec != NSEC_INFINITY)
3927 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
3928
3929 fprintf(f,
3930 "%sStandardInput: %s\n"
3931 "%sStandardOutput: %s\n"
3932 "%sStandardError: %s\n",
3933 prefix, exec_input_to_string(c->std_input),
3934 prefix, exec_output_to_string(c->std_output),
3935 prefix, exec_output_to_string(c->std_error));
3936
3937 if (c->tty_path)
3938 fprintf(f,
3939 "%sTTYPath: %s\n"
3940 "%sTTYReset: %s\n"
3941 "%sTTYVHangup: %s\n"
3942 "%sTTYVTDisallocate: %s\n",
3943 prefix, c->tty_path,
3944 prefix, yes_no(c->tty_reset),
3945 prefix, yes_no(c->tty_vhangup),
3946 prefix, yes_no(c->tty_vt_disallocate));
3947
3948 if (IN_SET(c->std_output,
3949 EXEC_OUTPUT_SYSLOG,
3950 EXEC_OUTPUT_KMSG,
3951 EXEC_OUTPUT_JOURNAL,
3952 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
3953 EXEC_OUTPUT_KMSG_AND_CONSOLE,
3954 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
3955 IN_SET(c->std_error,
3956 EXEC_OUTPUT_SYSLOG,
3957 EXEC_OUTPUT_KMSG,
3958 EXEC_OUTPUT_JOURNAL,
3959 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
3960 EXEC_OUTPUT_KMSG_AND_CONSOLE,
3961 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
3962
3963 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
3964
3965 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
3966 if (r >= 0)
3967 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
3968
3969 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
3970 if (r >= 0)
3971 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
3972 }
3973
3974 if (c->log_level_max >= 0) {
3975 _cleanup_free_ char *t = NULL;
3976
3977 (void) log_level_to_string_alloc(c->log_level_max, &t);
3978
3979 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
3980 }
3981
3982 if (c->n_log_extra_fields > 0) {
3983 size_t j;
3984
3985 for (j = 0; j < c->n_log_extra_fields; j++) {
3986 fprintf(f, "%sLogExtraFields: ", prefix);
3987 fwrite(c->log_extra_fields[j].iov_base,
3988 1, c->log_extra_fields[j].iov_len,
3989 f);
3990 fputc('\n', f);
3991 }
3992 }
3993
3994 if (c->secure_bits) {
3995 _cleanup_free_ char *str = NULL;
3996
3997 r = secure_bits_to_string_alloc(c->secure_bits, &str);
3998 if (r >= 0)
3999 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4000 }
4001
4002 if (c->capability_bounding_set != CAP_ALL) {
4003 _cleanup_free_ char *str = NULL;
4004
4005 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4006 if (r >= 0)
4007 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4008 }
4009
4010 if (c->capability_ambient_set != 0) {
4011 _cleanup_free_ char *str = NULL;
4012
4013 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4014 if (r >= 0)
4015 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4016 }
4017
4018 if (c->user)
4019 fprintf(f, "%sUser: %s\n", prefix, c->user);
4020 if (c->group)
4021 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4022
4023 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4024
4025 if (!strv_isempty(c->supplementary_groups)) {
4026 fprintf(f, "%sSupplementaryGroups:", prefix);
4027 strv_fprintf(f, c->supplementary_groups);
4028 fputs("\n", f);
4029 }
4030
4031 if (c->pam_name)
4032 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4033
4034 if (strv_length(c->read_write_paths) > 0) {
4035 fprintf(f, "%sReadWritePaths:", prefix);
4036 strv_fprintf(f, c->read_write_paths);
4037 fputs("\n", f);
4038 }
4039
4040 if (strv_length(c->read_only_paths) > 0) {
4041 fprintf(f, "%sReadOnlyPaths:", prefix);
4042 strv_fprintf(f, c->read_only_paths);
4043 fputs("\n", f);
4044 }
4045
4046 if (strv_length(c->inaccessible_paths) > 0) {
4047 fprintf(f, "%sInaccessiblePaths:", prefix);
4048 strv_fprintf(f, c->inaccessible_paths);
4049 fputs("\n", f);
4050 }
4051
4052 if (c->n_bind_mounts > 0)
4053 for (i = 0; i < c->n_bind_mounts; i++) {
4054 fprintf(f, "%s%s: %s:%s:%s\n", prefix,
4055 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4056 c->bind_mounts[i].source,
4057 c->bind_mounts[i].destination,
4058 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4059 }
4060
4061 if (c->utmp_id)
4062 fprintf(f,
4063 "%sUtmpIdentifier: %s\n",
4064 prefix, c->utmp_id);
4065
4066 if (c->selinux_context)
4067 fprintf(f,
4068 "%sSELinuxContext: %s%s\n",
4069 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4070
4071 if (c->apparmor_profile)
4072 fprintf(f,
4073 "%sAppArmorProfile: %s%s\n",
4074 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4075
4076 if (c->smack_process_label)
4077 fprintf(f,
4078 "%sSmackProcessLabel: %s%s\n",
4079 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4080
4081 if (c->personality != PERSONALITY_INVALID)
4082 fprintf(f,
4083 "%sPersonality: %s\n",
4084 prefix, strna(personality_to_string(c->personality)));
4085
4086 fprintf(f,
4087 "%sLockPersonality: %s\n",
4088 prefix, yes_no(c->lock_personality));
4089
4090 if (c->syscall_filter) {
4091 #if HAVE_SECCOMP
4092 Iterator j;
4093 void *id, *val;
4094 bool first = true;
4095 #endif
4096
4097 fprintf(f,
4098 "%sSystemCallFilter: ",
4099 prefix);
4100
4101 if (!c->syscall_whitelist)
4102 fputc('~', f);
4103
4104 #if HAVE_SECCOMP
4105 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4106 _cleanup_free_ char *name = NULL;
4107 const char *errno_name = NULL;
4108 int num = PTR_TO_INT(val);
4109
4110 if (first)
4111 first = false;
4112 else
4113 fputc(' ', f);
4114
4115 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4116 fputs(strna(name), f);
4117
4118 if (num >= 0) {
4119 errno_name = errno_to_name(num);
4120 if (errno_name)
4121 fprintf(f, ":%s", errno_name);
4122 else
4123 fprintf(f, ":%d", num);
4124 }
4125 }
4126 #endif
4127
4128 fputc('\n', f);
4129 }
4130
4131 if (c->syscall_archs) {
4132 #if HAVE_SECCOMP
4133 Iterator j;
4134 void *id;
4135 #endif
4136
4137 fprintf(f,
4138 "%sSystemCallArchitectures:",
4139 prefix);
4140
4141 #if HAVE_SECCOMP
4142 SET_FOREACH(id, c->syscall_archs, j)
4143 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4144 #endif
4145 fputc('\n', f);
4146 }
4147
4148 if (exec_context_restrict_namespaces_set(c)) {
4149 _cleanup_free_ char *s = NULL;
4150
4151 r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
4152 if (r >= 0)
4153 fprintf(f, "%sRestrictNamespaces: %s\n",
4154 prefix, s);
4155 }
4156
4157 if (c->syscall_errno > 0) {
4158 const char *errno_name;
4159
4160 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4161
4162 errno_name = errno_to_name(c->syscall_errno);
4163 if (errno_name)
4164 fprintf(f, "%s\n", errno_name);
4165 else
4166 fprintf(f, "%d\n", c->syscall_errno);
4167 }
4168
4169 if (c->apparmor_profile)
4170 fprintf(f,
4171 "%sAppArmorProfile: %s%s\n",
4172 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4173 }
4174
4175 bool exec_context_maintains_privileges(ExecContext *c) {
4176 assert(c);
4177
4178 /* Returns true if the process forked off would run under
4179 * an unchanged UID or as root. */
4180
4181 if (!c->user)
4182 return true;
4183
4184 if (streq(c->user, "root") || streq(c->user, "0"))
4185 return true;
4186
4187 return false;
4188 }
4189
4190 int exec_context_get_effective_ioprio(ExecContext *c) {
4191 int p;
4192
4193 assert(c);
4194
4195 if (c->ioprio_set)
4196 return c->ioprio;
4197
4198 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4199 if (p < 0)
4200 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4201
4202 return p;
4203 }
4204
4205 void exec_context_free_log_extra_fields(ExecContext *c) {
4206 size_t l;
4207
4208 assert(c);
4209
4210 for (l = 0; l < c->n_log_extra_fields; l++)
4211 free(c->log_extra_fields[l].iov_base);
4212 c->log_extra_fields = mfree(c->log_extra_fields);
4213 c->n_log_extra_fields = 0;
4214 }
4215
4216 void exec_status_start(ExecStatus *s, pid_t pid) {
4217 assert(s);
4218
4219 zero(*s);
4220 s->pid = pid;
4221 dual_timestamp_get(&s->start_timestamp);
4222 }
4223
4224 void exec_status_exit(ExecStatus *s, ExecContext *context, pid_t pid, int code, int status) {
4225 assert(s);
4226
4227 if (s->pid && s->pid != pid)
4228 zero(*s);
4229
4230 s->pid = pid;
4231 dual_timestamp_get(&s->exit_timestamp);
4232
4233 s->code = code;
4234 s->status = status;
4235
4236 if (context) {
4237 if (context->utmp_id)
4238 utmp_put_dead_process(context->utmp_id, pid, code, status);
4239
4240 exec_context_tty_reset(context, NULL);
4241 }
4242 }
4243
4244 void exec_status_dump(ExecStatus *s, FILE *f, const char *prefix) {
4245 char buf[FORMAT_TIMESTAMP_MAX];
4246
4247 assert(s);
4248 assert(f);
4249
4250 if (s->pid <= 0)
4251 return;
4252
4253 prefix = strempty(prefix);
4254
4255 fprintf(f,
4256 "%sPID: "PID_FMT"\n",
4257 prefix, s->pid);
4258
4259 if (dual_timestamp_is_set(&s->start_timestamp))
4260 fprintf(f,
4261 "%sStart Timestamp: %s\n",
4262 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4263
4264 if (dual_timestamp_is_set(&s->exit_timestamp))
4265 fprintf(f,
4266 "%sExit Timestamp: %s\n"
4267 "%sExit Code: %s\n"
4268 "%sExit Status: %i\n",
4269 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4270 prefix, sigchld_code_to_string(s->code),
4271 prefix, s->status);
4272 }
4273
4274 char *exec_command_line(char **argv) {
4275 size_t k;
4276 char *n, *p, **a;
4277 bool first = true;
4278
4279 assert(argv);
4280
4281 k = 1;
4282 STRV_FOREACH(a, argv)
4283 k += strlen(*a)+3;
4284
4285 n = new(char, k);
4286 if (!n)
4287 return NULL;
4288
4289 p = n;
4290 STRV_FOREACH(a, argv) {
4291
4292 if (!first)
4293 *(p++) = ' ';
4294 else
4295 first = false;
4296
4297 if (strpbrk(*a, WHITESPACE)) {
4298 *(p++) = '\'';
4299 p = stpcpy(p, *a);
4300 *(p++) = '\'';
4301 } else
4302 p = stpcpy(p, *a);
4303
4304 }
4305
4306 *p = 0;
4307
4308 /* FIXME: this doesn't really handle arguments that have
4309 * spaces and ticks in them */
4310
4311 return n;
4312 }
4313
4314 void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4315 _cleanup_free_ char *cmd = NULL;
4316 const char *prefix2;
4317
4318 assert(c);
4319 assert(f);
4320
4321 prefix = strempty(prefix);
4322 prefix2 = strjoina(prefix, "\t");
4323
4324 cmd = exec_command_line(c->argv);
4325 fprintf(f,
4326 "%sCommand Line: %s\n",
4327 prefix, cmd ? cmd : strerror(ENOMEM));
4328
4329 exec_status_dump(&c->exec_status, f, prefix2);
4330 }
4331
4332 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4333 assert(f);
4334
4335 prefix = strempty(prefix);
4336
4337 LIST_FOREACH(command, c, c)
4338 exec_command_dump(c, f, prefix);
4339 }
4340
4341 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4342 ExecCommand *end;
4343
4344 assert(l);
4345 assert(e);
4346
4347 if (*l) {
4348 /* It's kind of important, that we keep the order here */
4349 LIST_FIND_TAIL(command, *l, end);
4350 LIST_INSERT_AFTER(command, *l, end, e);
4351 } else
4352 *l = e;
4353 }
4354
4355 int exec_command_set(ExecCommand *c, const char *path, ...) {
4356 va_list ap;
4357 char **l, *p;
4358
4359 assert(c);
4360 assert(path);
4361
4362 va_start(ap, path);
4363 l = strv_new_ap(path, ap);
4364 va_end(ap);
4365
4366 if (!l)
4367 return -ENOMEM;
4368
4369 p = strdup(path);
4370 if (!p) {
4371 strv_free(l);
4372 return -ENOMEM;
4373 }
4374
4375 free(c->path);
4376 c->path = p;
4377
4378 strv_free(c->argv);
4379 c->argv = l;
4380
4381 return 0;
4382 }
4383
4384 int exec_command_append(ExecCommand *c, const char *path, ...) {
4385 _cleanup_strv_free_ char **l = NULL;
4386 va_list ap;
4387 int r;
4388
4389 assert(c);
4390 assert(path);
4391
4392 va_start(ap, path);
4393 l = strv_new_ap(path, ap);
4394 va_end(ap);
4395
4396 if (!l)
4397 return -ENOMEM;
4398
4399 r = strv_extend_strv(&c->argv, l, false);
4400 if (r < 0)
4401 return r;
4402
4403 return 0;
4404 }
4405
4406
4407 static int exec_runtime_allocate(ExecRuntime **rt) {
4408
4409 if (*rt)
4410 return 0;
4411
4412 *rt = new0(ExecRuntime, 1);
4413 if (!*rt)
4414 return -ENOMEM;
4415
4416 (*rt)->n_ref = 1;
4417 (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4418
4419 return 0;
4420 }
4421
4422 int exec_runtime_make(ExecRuntime **rt, ExecContext *c, const char *id) {
4423 int r;
4424
4425 assert(rt);
4426 assert(c);
4427 assert(id);
4428
4429 if (*rt)
4430 return 1;
4431
4432 if (!c->private_network && !c->private_tmp)
4433 return 0;
4434
4435 r = exec_runtime_allocate(rt);
4436 if (r < 0)
4437 return r;
4438
4439 if (c->private_network && (*rt)->netns_storage_socket[0] < 0) {
4440 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, (*rt)->netns_storage_socket) < 0)
4441 return -errno;
4442 }
4443
4444 if (c->private_tmp && !(*rt)->tmp_dir) {
4445 r = setup_tmp_dirs(id, &(*rt)->tmp_dir, &(*rt)->var_tmp_dir);
4446 if (r < 0)
4447 return r;
4448 }
4449
4450 return 1;
4451 }
4452
4453 ExecRuntime *exec_runtime_ref(ExecRuntime *r) {
4454 assert(r);
4455 assert(r->n_ref > 0);
4456
4457 r->n_ref++;
4458 return r;
4459 }
4460
4461 ExecRuntime *exec_runtime_unref(ExecRuntime *r) {
4462
4463 if (!r)
4464 return NULL;
4465
4466 assert(r->n_ref > 0);
4467
4468 r->n_ref--;
4469 if (r->n_ref > 0)
4470 return NULL;
4471
4472 free(r->tmp_dir);
4473 free(r->var_tmp_dir);
4474 safe_close_pair(r->netns_storage_socket);
4475 return mfree(r);
4476 }
4477
4478 int exec_runtime_serialize(Unit *u, ExecRuntime *rt, FILE *f, FDSet *fds) {
4479 assert(u);
4480 assert(f);
4481 assert(fds);
4482
4483 if (!rt)
4484 return 0;
4485
4486 if (rt->tmp_dir)
4487 unit_serialize_item(u, f, "tmp-dir", rt->tmp_dir);
4488
4489 if (rt->var_tmp_dir)
4490 unit_serialize_item(u, f, "var-tmp-dir", rt->var_tmp_dir);
4491
4492 if (rt->netns_storage_socket[0] >= 0) {
4493 int copy;
4494
4495 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4496 if (copy < 0)
4497 return copy;
4498
4499 unit_serialize_item_format(u, f, "netns-socket-0", "%i", copy);
4500 }
4501
4502 if (rt->netns_storage_socket[1] >= 0) {
4503 int copy;
4504
4505 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4506 if (copy < 0)
4507 return copy;
4508
4509 unit_serialize_item_format(u, f, "netns-socket-1", "%i", copy);
4510 }
4511
4512 return 0;
4513 }
4514
4515 int exec_runtime_deserialize_item(Unit *u, ExecRuntime **rt, const char *key, const char *value, FDSet *fds) {
4516 int r;
4517
4518 assert(rt);
4519 assert(key);
4520 assert(value);
4521
4522 if (streq(key, "tmp-dir")) {
4523 char *copy;
4524
4525 r = exec_runtime_allocate(rt);
4526 if (r < 0)
4527 return log_oom();
4528
4529 copy = strdup(value);
4530 if (!copy)
4531 return log_oom();
4532
4533 free((*rt)->tmp_dir);
4534 (*rt)->tmp_dir = copy;
4535
4536 } else if (streq(key, "var-tmp-dir")) {
4537 char *copy;
4538
4539 r = exec_runtime_allocate(rt);
4540 if (r < 0)
4541 return log_oom();
4542
4543 copy = strdup(value);
4544 if (!copy)
4545 return log_oom();
4546
4547 free((*rt)->var_tmp_dir);
4548 (*rt)->var_tmp_dir = copy;
4549
4550 } else if (streq(key, "netns-socket-0")) {
4551 int fd;
4552
4553 r = exec_runtime_allocate(rt);
4554 if (r < 0)
4555 return log_oom();
4556
4557 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4558 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4559 else {
4560 safe_close((*rt)->netns_storage_socket[0]);
4561 (*rt)->netns_storage_socket[0] = fdset_remove(fds, fd);
4562 }
4563 } else if (streq(key, "netns-socket-1")) {
4564 int fd;
4565
4566 r = exec_runtime_allocate(rt);
4567 if (r < 0)
4568 return log_oom();
4569
4570 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4571 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4572 else {
4573 safe_close((*rt)->netns_storage_socket[1]);
4574 (*rt)->netns_storage_socket[1] = fdset_remove(fds, fd);
4575 }
4576 } else
4577 return 0;
4578
4579 return 1;
4580 }
4581
4582 static void *remove_tmpdir_thread(void *p) {
4583 _cleanup_free_ char *path = p;
4584
4585 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4586 return NULL;
4587 }
4588
4589 void exec_runtime_destroy(ExecRuntime *rt) {
4590 int r;
4591
4592 if (!rt)
4593 return;
4594
4595 /* If there are multiple users of this, let's leave the stuff around */
4596 if (rt->n_ref > 1)
4597 return;
4598
4599 if (rt->tmp_dir) {
4600 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4601
4602 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4603 if (r < 0) {
4604 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4605 free(rt->tmp_dir);
4606 }
4607
4608 rt->tmp_dir = NULL;
4609 }
4610
4611 if (rt->var_tmp_dir) {
4612 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4613
4614 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4615 if (r < 0) {
4616 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4617 free(rt->var_tmp_dir);
4618 }
4619
4620 rt->var_tmp_dir = NULL;
4621 }
4622
4623 safe_close_pair(rt->netns_storage_socket);
4624 }
4625
4626 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
4627 [EXEC_INPUT_NULL] = "null",
4628 [EXEC_INPUT_TTY] = "tty",
4629 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4630 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
4631 [EXEC_INPUT_SOCKET] = "socket",
4632 [EXEC_INPUT_NAMED_FD] = "fd",
4633 };
4634
4635 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
4636
4637 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
4638 [EXEC_OUTPUT_INHERIT] = "inherit",
4639 [EXEC_OUTPUT_NULL] = "null",
4640 [EXEC_OUTPUT_TTY] = "tty",
4641 [EXEC_OUTPUT_SYSLOG] = "syslog",
4642 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
4643 [EXEC_OUTPUT_KMSG] = "kmsg",
4644 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
4645 [EXEC_OUTPUT_JOURNAL] = "journal",
4646 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
4647 [EXEC_OUTPUT_SOCKET] = "socket",
4648 [EXEC_OUTPUT_NAMED_FD] = "fd",
4649 };
4650
4651 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
4652
4653 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
4654 [EXEC_UTMP_INIT] = "init",
4655 [EXEC_UTMP_LOGIN] = "login",
4656 [EXEC_UTMP_USER] = "user",
4657 };
4658
4659 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
4660
4661 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
4662 [EXEC_PRESERVE_NO] = "no",
4663 [EXEC_PRESERVE_YES] = "yes",
4664 [EXEC_PRESERVE_RESTART] = "restart",
4665 };
4666
4667 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
4668
4669 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
4670 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
4671 [EXEC_DIRECTORY_STATE] = "StateDirectory",
4672 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
4673 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
4674 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
4675 };
4676
4677 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
4678
4679 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
4680 [EXEC_KEYRING_INHERIT] = "inherit",
4681 [EXEC_KEYRING_PRIVATE] = "private",
4682 [EXEC_KEYRING_SHARED] = "shared",
4683 };
4684
4685 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);