]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
build-sys: use #if Y instead of #ifdef Y everywhere
[thirdparty/systemd.git] / src / core / execute.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <errno.h>
21 #include <fcntl.h>
22 #include <glob.h>
23 #include <grp.h>
24 #include <poll.h>
25 #include <signal.h>
26 #include <string.h>
27 #include <sys/capability.h>
28 #include <sys/eventfd.h>
29 #include <sys/mman.h>
30 #include <sys/personality.h>
31 #include <sys/prctl.h>
32 #include <sys/shm.h>
33 #include <sys/socket.h>
34 #include <sys/stat.h>
35 #include <sys/types.h>
36 #include <sys/un.h>
37 #include <unistd.h>
38 #include <utmpx.h>
39
40 #if HAVE_PAM
41 #include <security/pam_appl.h>
42 #endif
43
44 #if HAVE_SELINUX
45 #include <selinux/selinux.h>
46 #endif
47
48 #if HAVE_SECCOMP
49 #include <seccomp.h>
50 #endif
51
52 #if HAVE_APPARMOR
53 #include <sys/apparmor.h>
54 #endif
55
56 #include "sd-messages.h"
57
58 #include "af-list.h"
59 #include "alloc-util.h"
60 #if HAVE_APPARMOR
61 #include "apparmor-util.h"
62 #endif
63 #include "async.h"
64 #include "barrier.h"
65 #include "cap-list.h"
66 #include "capability-util.h"
67 #include "chown-recursive.h"
68 #include "def.h"
69 #include "env-util.h"
70 #include "errno-list.h"
71 #include "execute.h"
72 #include "exit-status.h"
73 #include "fd-util.h"
74 #include "fileio.h"
75 #include "format-util.h"
76 #include "fs-util.h"
77 #include "glob-util.h"
78 #include "io-util.h"
79 #include "ioprio.h"
80 #include "label.h"
81 #include "log.h"
82 #include "macro.h"
83 #include "missing.h"
84 #include "mkdir.h"
85 #include "namespace.h"
86 #include "parse-util.h"
87 #include "path-util.h"
88 #include "process-util.h"
89 #include "rlimit-util.h"
90 #include "rm-rf.h"
91 #if HAVE_SECCOMP
92 #include "seccomp-util.h"
93 #endif
94 #include "securebits.h"
95 #include "securebits-util.h"
96 #include "selinux-util.h"
97 #include "signal-util.h"
98 #include "smack-util.h"
99 #include "special.h"
100 #include "string-table.h"
101 #include "string-util.h"
102 #include "strv.h"
103 #include "syslog-util.h"
104 #include "terminal-util.h"
105 #include "unit.h"
106 #include "user-util.h"
107 #include "util.h"
108 #include "utmp-wtmp.h"
109
110 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
111 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
112
113 /* This assumes there is a 'tty' group */
114 #define TTY_MODE 0620
115
116 #define SNDBUF_SIZE (8*1024*1024)
117
/* Relocates the passed file descriptors so that fds[i] ends up being fd number i+3, i.e. the array
 * occupies a contiguous range starting right after stdin/stdout/stderr. The array is updated in
 * place. Returns 0 on success, or a negative errno if duplicating an fd fails. */
static int shift_fds(int fds[], unsigned n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        for (;;) {
                int retry_at = -1;
                int idx;

                for (idx = first; idx < (int) n_fds; idx++) {
                        int target = idx + 3;
                        int dup_fd;

                        /* Nothing to do if this one already sits in its slot */
                        if (fds[idx] == target)
                                continue;

                        /* F_DUPFD hands us the lowest free fd that is >= target */
                        dup_fd = fcntl(fds[idx], F_DUPFD, target);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* The slot we wanted was still occupied. Remember the first index
                         * where that happened, so that the next pass starts from there. */
                        if (dup_fd != target && retry_at < 0)
                                retry_at = idx;
                }

                /* A pass without any miss means every fd is where it belongs */
                if (retry_at < 0)
                        return 0;

                first = retry_at;
        }
}
162
/* Adjusts the file descriptor flags of the fds about to be handed to a child: FD_CLOEXEC is turned
 * off on every entry, while O_NONBLOCK is switched according to 'nonblock' on the first
 * n_socket_fds entries only. Returns 0 on success, or the negative error of the first helper call
 * that fails. */
static int flags_fds(const int fds[], unsigned n_storage_fds, unsigned n_socket_fds, bool nonblock) {
        unsigned total = n_storage_fds + n_socket_fds;
        unsigned idx = 0;

        if (total == 0)
                return 0;

        assert(fds);

        while (idx < total) {
                int r;

                /* O_NONBLOCK is only touched for the socket activation fds */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* These fds are meant to be inherited by the child, hence FD_CLOEXEC always goes */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;

                idx++;
        }

        return 0;
}
195
196 static const char *exec_context_tty_path(const ExecContext *context) {
197 assert(context);
198
199 if (context->stdio_as_fds)
200 return NULL;
201
202 if (context->tty_path)
203 return context->tty_path;
204
205 return "/dev/console";
206 }
207
208 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
209 const char *path;
210
211 assert(context);
212
213 path = exec_context_tty_path(context);
214
215 if (context->tty_vhangup) {
216 if (p && p->stdin_fd >= 0)
217 (void) terminal_vhangup_fd(p->stdin_fd);
218 else if (path)
219 (void) terminal_vhangup(path);
220 }
221
222 if (context->tty_reset) {
223 if (p && p->stdin_fd >= 0)
224 (void) reset_terminal_fd(p->stdin_fd, true);
225 else if (path)
226 (void) reset_terminal(path);
227 }
228
229 if (context->tty_vt_disallocate && path)
230 (void) vt_disallocate(path);
231 }
232
233 static bool is_terminal_input(ExecInput i) {
234 return IN_SET(i,
235 EXEC_INPUT_TTY,
236 EXEC_INPUT_TTY_FORCE,
237 EXEC_INPUT_TTY_FAIL);
238 }
239
240 static bool is_terminal_output(ExecOutput o) {
241 return IN_SET(o,
242 EXEC_OUTPUT_TTY,
243 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
244 EXEC_OUTPUT_KMSG_AND_CONSOLE,
245 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
246 }
247
248 static bool is_syslog_output(ExecOutput o) {
249 return IN_SET(o,
250 EXEC_OUTPUT_SYSLOG,
251 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
252 }
253
254 static bool is_kmsg_output(ExecOutput o) {
255 return IN_SET(o,
256 EXEC_OUTPUT_KMSG,
257 EXEC_OUTPUT_KMSG_AND_CONSOLE);
258 }
259
260 static bool exec_context_needs_term(const ExecContext *c) {
261 assert(c);
262
263 /* Return true if the execution context suggests we should set $TERM to something useful. */
264
265 if (is_terminal_input(c->std_input))
266 return true;
267
268 if (is_terminal_output(c->std_output))
269 return true;
270
271 if (is_terminal_output(c->std_error))
272 return true;
273
274 return !!c->tty_path;
275 }
276
/* Opens /dev/null with the given flags (plus O_NOCTTY) and installs it as fd number 'nfd'.
 * Returns nfd on success, a negative errno on failure. */
static int open_null_as(int flags, int nfd) {
        int null_fd, ret;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        /* We got lucky and the kernel already handed us the right number */
        if (null_fd == nfd)
                return nfd;

        ret = dup2(null_fd, nfd) < 0 ? -errno : nfd;
        safe_close(null_fd);

        return ret;
}
294
295 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
296 static const union sockaddr_union sa = {
297 .un.sun_family = AF_UNIX,
298 .un.sun_path = "/run/systemd/journal/stdout",
299 };
300 uid_t olduid = UID_INVALID;
301 gid_t oldgid = GID_INVALID;
302 int r;
303
304 if (gid_is_valid(gid)) {
305 oldgid = getgid();
306
307 if (setegid(gid) < 0)
308 return -errno;
309 }
310
311 if (uid_is_valid(uid)) {
312 olduid = getuid();
313
314 if (seteuid(uid) < 0) {
315 r = -errno;
316 goto restore_gid;
317 }
318 }
319
320 r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
321
322 /* If we fail to restore the uid or gid, things will likely
323 fail later on. This should only happen if an LSM interferes. */
324
325 if (uid_is_valid(uid))
326 (void) seteuid(olduid);
327
328 restore_gid:
329 if (gid_is_valid(gid))
330 (void) setegid(oldgid);
331
332 return r;
333 }
334
335 static int connect_logger_as(
336 Unit *unit,
337 const ExecContext *context,
338 const ExecParameters *params,
339 ExecOutput output,
340 const char *ident,
341 int nfd,
342 uid_t uid,
343 gid_t gid) {
344
345 int fd, r;
346
347 assert(context);
348 assert(params);
349 assert(output < _EXEC_OUTPUT_MAX);
350 assert(ident);
351 assert(nfd >= 0);
352
353 fd = socket(AF_UNIX, SOCK_STREAM, 0);
354 if (fd < 0)
355 return -errno;
356
357 r = connect_journal_socket(fd, uid, gid);
358 if (r < 0)
359 return r;
360
361 if (shutdown(fd, SHUT_RD) < 0) {
362 safe_close(fd);
363 return -errno;
364 }
365
366 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
367
368 dprintf(fd,
369 "%s\n"
370 "%s\n"
371 "%i\n"
372 "%i\n"
373 "%i\n"
374 "%i\n"
375 "%i\n",
376 context->syslog_identifier ?: ident,
377 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
378 context->syslog_priority,
379 !!context->syslog_level_prefix,
380 is_syslog_output(output),
381 is_kmsg_output(output),
382 is_terminal_output(output));
383
384 if (fd == nfd)
385 return nfd;
386
387 r = dup2(fd, nfd) < 0 ? -errno : nfd;
388 safe_close(fd);
389
390 return r;
391 }
/* Opens the terminal at 'path' (with 'mode' plus O_NOCTTY passed to open_terminal()) and installs
 * it as fd number 'nfd'. Returns nfd on success, a negative error on failure. */
static int open_terminal_as(const char *path, mode_t mode, int nfd) {
        int tty_fd, ret;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, mode | O_NOCTTY);
        if (tty_fd < 0)
                return tty_fd;

        /* Already the right number? Then there is nothing to move. */
        if (tty_fd == nfd)
                return nfd;

        ret = dup2(tty_fd, nfd) < 0 ? -errno : nfd;
        safe_close(tty_fd);

        return ret;
}
410
411 static int fixup_input(ExecInput std_input, int socket_fd, bool apply_tty_stdin) {
412
413 if (is_terminal_input(std_input) && !apply_tty_stdin)
414 return EXEC_INPUT_NULL;
415
416 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
417 return EXEC_INPUT_NULL;
418
419 return std_input;
420 }
421
422 static int fixup_output(ExecOutput std_output, int socket_fd) {
423
424 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
425 return EXEC_OUTPUT_INHERIT;
426
427 return std_output;
428 }
429
430 static int setup_input(
431 const ExecContext *context,
432 const ExecParameters *params,
433 int socket_fd,
434 int named_iofds[3]) {
435
436 ExecInput i;
437
438 assert(context);
439 assert(params);
440
441 if (params->stdin_fd >= 0) {
442 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
443 return -errno;
444
445 /* Try to make this the controlling tty, if it is a tty, and reset it */
446 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
447 (void) reset_terminal_fd(STDIN_FILENO, true);
448
449 return STDIN_FILENO;
450 }
451
452 i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
453
454 switch (i) {
455
456 case EXEC_INPUT_NULL:
457 return open_null_as(O_RDONLY, STDIN_FILENO);
458
459 case EXEC_INPUT_TTY:
460 case EXEC_INPUT_TTY_FORCE:
461 case EXEC_INPUT_TTY_FAIL: {
462 int fd, r;
463
464 fd = acquire_terminal(exec_context_tty_path(context),
465 i == EXEC_INPUT_TTY_FAIL,
466 i == EXEC_INPUT_TTY_FORCE,
467 false,
468 USEC_INFINITY);
469 if (fd < 0)
470 return fd;
471
472 if (fd != STDIN_FILENO) {
473 r = dup2(fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
474 safe_close(fd);
475 } else
476 r = STDIN_FILENO;
477
478 return r;
479 }
480
481 case EXEC_INPUT_SOCKET:
482 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
483
484 case EXEC_INPUT_NAMED_FD:
485 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
486 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
487
488 default:
489 assert_not_reached("Unknown input type");
490 }
491 }
492
493 static int setup_output(
494 Unit *unit,
495 const ExecContext *context,
496 const ExecParameters *params,
497 int fileno,
498 int socket_fd,
499 int named_iofds[3],
500 const char *ident,
501 uid_t uid,
502 gid_t gid,
503 dev_t *journal_stream_dev,
504 ino_t *journal_stream_ino) {
505
506 ExecOutput o;
507 ExecInput i;
508 int r;
509
510 assert(unit);
511 assert(context);
512 assert(params);
513 assert(ident);
514 assert(journal_stream_dev);
515 assert(journal_stream_ino);
516
517 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
518
519 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
520 return -errno;
521
522 return STDOUT_FILENO;
523 }
524
525 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
526 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
527 return -errno;
528
529 return STDERR_FILENO;
530 }
531
532 i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
533 o = fixup_output(context->std_output, socket_fd);
534
535 if (fileno == STDERR_FILENO) {
536 ExecOutput e;
537 e = fixup_output(context->std_error, socket_fd);
538
539 /* This expects the input and output are already set up */
540
541 /* Don't change the stderr file descriptor if we inherit all
542 * the way and are not on a tty */
543 if (e == EXEC_OUTPUT_INHERIT &&
544 o == EXEC_OUTPUT_INHERIT &&
545 i == EXEC_INPUT_NULL &&
546 !is_terminal_input(context->std_input) &&
547 getppid () != 1)
548 return fileno;
549
550 /* Duplicate from stdout if possible */
551 if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
552 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
553
554 o = e;
555
556 } else if (o == EXEC_OUTPUT_INHERIT) {
557 /* If input got downgraded, inherit the original value */
558 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
559 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
560
561 /* If the input is connected to anything that's not a /dev/null, inherit that... */
562 if (i != EXEC_INPUT_NULL)
563 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
564
565 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
566 if (getppid() != 1)
567 return fileno;
568
569 /* We need to open /dev/null here anew, to get the right access mode. */
570 return open_null_as(O_WRONLY, fileno);
571 }
572
573 switch (o) {
574
575 case EXEC_OUTPUT_NULL:
576 return open_null_as(O_WRONLY, fileno);
577
578 case EXEC_OUTPUT_TTY:
579 if (is_terminal_input(i))
580 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
581
582 /* We don't reset the terminal if this is just about output */
583 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
584
585 case EXEC_OUTPUT_SYSLOG:
586 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
587 case EXEC_OUTPUT_KMSG:
588 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
589 case EXEC_OUTPUT_JOURNAL:
590 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
591 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
592 if (r < 0) {
593 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
594 r = open_null_as(O_WRONLY, fileno);
595 } else {
596 struct stat st;
597
598 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
599 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
600 * services to detect whether they are connected to the journal or not.
601 *
602 * If both stdout and stderr are connected to a stream then let's make sure to store the data
603 * about STDERR as that's usually the best way to do logging. */
604
605 if (fstat(fileno, &st) >= 0 &&
606 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
607 *journal_stream_dev = st.st_dev;
608 *journal_stream_ino = st.st_ino;
609 }
610 }
611 return r;
612
613 case EXEC_OUTPUT_SOCKET:
614 assert(socket_fd >= 0);
615 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
616
617 case EXEC_OUTPUT_NAMED_FD:
618 (void) fd_nonblock(named_iofds[fileno], false);
619 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
620
621 default:
622 assert_not_reached("Unknown error type");
623 }
624 }
625
626 static int chown_terminal(int fd, uid_t uid) {
627 struct stat st;
628
629 assert(fd >= 0);
630
631 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
632 if (isatty(fd) < 1)
633 return 0;
634
635 /* This might fail. What matters are the results. */
636 (void) fchown(fd, uid, -1);
637 (void) fchmod(fd, TTY_MODE);
638
639 if (fstat(fd, &st) < 0)
640 return -errno;
641
642 if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
643 return -EPERM;
644
645 return 0;
646 }
647
648 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
649 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
650 int r;
651
652 assert(_saved_stdin);
653 assert(_saved_stdout);
654
655 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
656 if (saved_stdin < 0)
657 return -errno;
658
659 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
660 if (saved_stdout < 0)
661 return -errno;
662
663 fd = acquire_terminal(vc, false, false, false, DEFAULT_CONFIRM_USEC);
664 if (fd < 0)
665 return fd;
666
667 r = chown_terminal(fd, getuid());
668 if (r < 0)
669 return r;
670
671 r = reset_terminal_fd(fd, true);
672 if (r < 0)
673 return r;
674
675 if (dup2(fd, STDIN_FILENO) < 0)
676 return -errno;
677
678 if (dup2(fd, STDOUT_FILENO) < 0)
679 return -errno;
680
681 if (fd >= 2)
682 safe_close(fd);
683 fd = -1;
684
685 *_saved_stdin = saved_stdin;
686 *_saved_stdout = saved_stdout;
687
688 saved_stdin = saved_stdout = -1;
689
690 return 0;
691 }
692
693 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
694 assert(err < 0);
695
696 if (err == -ETIMEDOUT)
697 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
698 else {
699 errno = -err;
700 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
701 }
702 }
703
704 static void write_confirm_error(int err, const char *vc, const Unit *u) {
705 _cleanup_close_ int fd = -1;
706
707 assert(vc);
708
709 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
710 if (fd < 0)
711 return;
712
713 write_confirm_error_fd(err, fd, u);
714 }
715
/* Counterpart of setup_confirm_stdio(): releases the terminal, puts the saved stdin/stdout back
 * in place (where valid) and closes the saved copies. Returns 0 on success, or the negative errno
 * of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        /* The copies are closed regardless of whether restoring worked */
        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
737
738 enum {
739 CONFIRM_PRETEND_FAILURE = -1,
740 CONFIRM_PRETEND_SUCCESS = 0,
741 CONFIRM_EXECUTE = 1,
742 };
743
744 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
745 int saved_stdout = -1, saved_stdin = -1, r;
746 _cleanup_free_ char *e = NULL;
747 char c;
748
749 /* For any internal errors, assume a positive response. */
750 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
751 if (r < 0) {
752 write_confirm_error(r, vc, u);
753 return CONFIRM_EXECUTE;
754 }
755
756 /* confirm_spawn might have been disabled while we were sleeping. */
757 if (manager_is_confirm_spawn_disabled(u->manager)) {
758 r = 1;
759 goto restore_stdio;
760 }
761
762 e = ellipsize(cmdline, 60, 100);
763 if (!e) {
764 log_oom();
765 r = CONFIRM_EXECUTE;
766 goto restore_stdio;
767 }
768
769 for (;;) {
770 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
771 if (r < 0) {
772 write_confirm_error_fd(r, STDOUT_FILENO, u);
773 r = CONFIRM_EXECUTE;
774 goto restore_stdio;
775 }
776
777 switch (c) {
778 case 'c':
779 printf("Resuming normal execution.\n");
780 manager_disable_confirm_spawn();
781 r = 1;
782 break;
783 case 'D':
784 unit_dump(u, stdout, " ");
785 continue; /* ask again */
786 case 'f':
787 printf("Failing execution.\n");
788 r = CONFIRM_PRETEND_FAILURE;
789 break;
790 case 'h':
791 printf(" c - continue, proceed without asking anymore\n"
792 " D - dump, show the state of the unit\n"
793 " f - fail, don't execute the command and pretend it failed\n"
794 " h - help\n"
795 " i - info, show a short summary of the unit\n"
796 " j - jobs, show jobs that are in progress\n"
797 " s - skip, don't execute the command and pretend it succeeded\n"
798 " y - yes, execute the command\n");
799 continue; /* ask again */
800 case 'i':
801 printf(" Description: %s\n"
802 " Unit: %s\n"
803 " Command: %s\n",
804 u->id, u->description, cmdline);
805 continue; /* ask again */
806 case 'j':
807 manager_dump_jobs(u->manager, stdout, " ");
808 continue; /* ask again */
809 case 'n':
810 /* 'n' was removed in favor of 'f'. */
811 printf("Didn't understand 'n', did you mean 'f'?\n");
812 continue; /* ask again */
813 case 's':
814 printf("Skipping execution.\n");
815 r = CONFIRM_PRETEND_SUCCESS;
816 break;
817 case 'y':
818 r = CONFIRM_EXECUTE;
819 break;
820 default:
821 assert_not_reached("Unhandled choice");
822 }
823 break;
824 }
825
826 restore_stdio:
827 restore_confirm_stdio(&saved_stdin, &saved_stdout);
828 return r;
829 }
830
831 static int get_fixed_user(const ExecContext *c, const char **user,
832 uid_t *uid, gid_t *gid,
833 const char **home, const char **shell) {
834 int r;
835 const char *name;
836
837 assert(c);
838
839 if (!c->user)
840 return 0;
841
842 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
843 * (i.e. are "/" or "/bin/nologin"). */
844
845 name = c->user;
846 r = get_user_creds_clean(&name, uid, gid, home, shell);
847 if (r < 0)
848 return r;
849
850 *user = name;
851 return 0;
852 }
853
854 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
855 int r;
856 const char *name;
857
858 assert(c);
859
860 if (!c->group)
861 return 0;
862
863 name = c->group;
864 r = get_group_creds(&name, gid);
865 if (r < 0)
866 return r;
867
868 *group = name;
869 return 0;
870 }
871
872 static int get_supplementary_groups(const ExecContext *c, const char *user,
873 const char *group, gid_t gid,
874 gid_t **supplementary_gids, int *ngids) {
875 char **i;
876 int r, k = 0;
877 int ngroups_max;
878 bool keep_groups = false;
879 gid_t *groups = NULL;
880 _cleanup_free_ gid_t *l_gids = NULL;
881
882 assert(c);
883
884 /*
885 * If user is given, then lookup GID and supplementary groups list.
886 * We avoid NSS lookups for gid=0. Also we have to initialize groups
887 * here and as early as possible so we keep the list of supplementary
888 * groups of the caller.
889 */
890 if (user && gid_is_valid(gid) && gid != 0) {
891 /* First step, initialize groups from /etc/groups */
892 if (initgroups(user, gid) < 0)
893 return -errno;
894
895 keep_groups = true;
896 }
897
898 if (!c->supplementary_groups)
899 return 0;
900
901 /*
902 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
903 * be positive, otherwise fail.
904 */
905 errno = 0;
906 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
907 if (ngroups_max <= 0) {
908 if (errno > 0)
909 return -errno;
910 else
911 return -EOPNOTSUPP; /* For all other values */
912 }
913
914 l_gids = new(gid_t, ngroups_max);
915 if (!l_gids)
916 return -ENOMEM;
917
918 if (keep_groups) {
919 /*
920 * Lookup the list of groups that the user belongs to, we
921 * avoid NSS lookups here too for gid=0.
922 */
923 k = ngroups_max;
924 if (getgrouplist(user, gid, l_gids, &k) < 0)
925 return -EINVAL;
926 } else
927 k = 0;
928
929 STRV_FOREACH(i, c->supplementary_groups) {
930 const char *g;
931
932 if (k >= ngroups_max)
933 return -E2BIG;
934
935 g = *i;
936 r = get_group_creds(&g, l_gids+k);
937 if (r < 0)
938 return r;
939
940 k++;
941 }
942
943 /*
944 * Sets ngids to zero to drop all supplementary groups, happens
945 * when we are under root and SupplementaryGroups= is empty.
946 */
947 if (k == 0) {
948 *ngids = 0;
949 return 0;
950 }
951
952 /* Otherwise get the final list of supplementary groups */
953 groups = memdup(l_gids, sizeof(gid_t) * k);
954 if (!groups)
955 return -ENOMEM;
956
957 *supplementary_gids = groups;
958 *ngids = k;
959
960 groups = NULL;
961
962 return 0;
963 }
964
965 static int enforce_groups(const ExecContext *context, gid_t gid,
966 gid_t *supplementary_gids, int ngids) {
967 int r;
968
969 assert(context);
970
971 /* Handle SupplementaryGroups= even if it is empty */
972 if (context->supplementary_groups) {
973 r = maybe_setgroups(ngids, supplementary_gids);
974 if (r < 0)
975 return r;
976 }
977
978 if (gid_is_valid(gid)) {
979 /* Then set our gids */
980 if (setresgid(gid, gid, gid) < 0)
981 return -errno;
982 }
983
984 return 0;
985 }
986
987 static int enforce_user(const ExecContext *context, uid_t uid) {
988 assert(context);
989
990 if (!uid_is_valid(uid))
991 return 0;
992
993 /* Sets (but doesn't look up) the uid and make sure we keep the
994 * capabilities while doing so. */
995
996 if (context->capability_ambient_set != 0) {
997
998 /* First step: If we need to keep capabilities but
999 * drop privileges we need to make sure we keep our
1000 * caps, while we drop privileges. */
1001 if (uid != 0) {
1002 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1003
1004 if (prctl(PR_GET_SECUREBITS) != sb)
1005 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1006 return -errno;
1007 }
1008 }
1009
1010 /* Second step: actually set the uids */
1011 if (setresuid(uid, uid, uid) < 0)
1012 return -errno;
1013
1014 /* At this point we should have all necessary capabilities but
1015 are otherwise a normal user. However, the caps might got
1016 corrupted due to the setresuid() so we need clean them up
1017 later. This is done outside of this call. */
1018
1019 return 0;
1020 }
1021
#if HAVE_PAM

/* PAM conversation callback that refuses every conversation: all arguments are ignored and
 * PAM_CONV_ERR is returned unconditionally. */
static int null_conv(int num_msg, const struct pam_message **msg, struct pam_response **resp, void *appdata_ptr) {
        return PAM_CONV_ERR;
}

#endif
1036
/* Opens a PAM session for 'user' with the PAM service 'name', and forks off a helper process that
 * closes the session again once the calling process is gone.
 *
 * 'tty', if non-NULL, is set as PAM_TTY. The environment in *env is exported into PAM, and on
 * success *env is freed and replaced by the environment list PAM returns. 'fds'/'n_fds' are the
 * fds the helper process shall close; 'uid'/'gid' are the credentials it drops to.
 *
 * Returns 0 on success (and always when built without PAM support). On failure a negative errno
 * is returned, with PAM errors being reported as -EPERM. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], unsigned n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via the parent death
         * signal (PR_SET_PDEATHSIG) or by systemd via the cgroup
         * logic. It will then remove the PAM session again. The
         * parent process will exec() the actual daemon. We do things
         * this way to ensure that the main PID of the daemon is the
         * one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we log at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        pam_pid = fork();
        if (pam_pid < 0) {
                r = -errno;

                /* No child was created, hence nobody is going to wait for SIGTERM. Don't
                 * leave the signal blocked in the caller. */
                assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
                goto fail;
        }

        if (pam_pid == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* This string must fit in 10 chars (i.e. the length
                 * of "/sbin/init"), to look pretty in /bin/ps */
                rename_process("(sd-pam)");

                /* Make sure we don't keep open the passed fds in this
                child. We assume that otherwise only those fds are
                open here that have been opened by PAM. */
                close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                if (sigwait(&ss, &sig) < 0) {
                                        if (errno == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Replace the caller's environment by what PAM handed us */
        strv_free(*env);
        *env = e;

        return 0;

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1253
/* Renames the current process to the last path component of 'path' in parentheses, truncated to
 * its final 8 characters. If there is no usable last component "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char buf[11]; /* '(' + at most 8 characters + ')' + NUL */
        const char *base;
        size_t len;

        /* The result has to fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
         * /bin/ps */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        len = strlen(base);
        if (len > 8) {
                /* Keep the tail rather than the head: it is usually the more interesting part,
                 * as the beginning might just be "systemd-" */
                base += len - 8;
                len = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, base, len);
        buf[len + 1] = ')';
        buf[len + 2] = 0;

        rename_process(buf);
}
1284
1285 static bool context_has_address_families(const ExecContext *c) {
1286 assert(c);
1287
1288 return c->address_families_whitelist ||
1289 !set_isempty(c->address_families);
1290 }
1291
1292 static bool context_has_syscall_filters(const ExecContext *c) {
1293 assert(c);
1294
1295 return c->syscall_whitelist ||
1296 !set_isempty(c->syscall_filter);
1297 }
1298
1299 static bool context_has_no_new_privileges(const ExecContext *c) {
1300 assert(c);
1301
1302 if (c->no_new_privileges)
1303 return true;
1304
1305 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1306 return false;
1307
1308 /* We need NNP if we have any form of seccomp and are unprivileged */
1309 return context_has_address_families(c) ||
1310 c->memory_deny_write_execute ||
1311 c->restrict_realtime ||
1312 exec_context_restrict_namespaces_set(c) ||
1313 c->protect_kernel_tunables ||
1314 c->protect_kernel_modules ||
1315 c->private_devices ||
1316 context_has_syscall_filters(c) ||
1317 !set_isempty(c->syscall_archs) ||
1318 c->lock_personality;
1319 }
1320
1321 #if HAVE_SECCOMP
1322
1323 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1324
1325 if (is_seccomp_available())
1326 return false;
1327
1328 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1329 return true;
1330 }
1331
1332 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1333 uint32_t negative_action, default_action, action;
1334 int r;
1335
1336 assert(u);
1337 assert(c);
1338
1339 if (!context_has_syscall_filters(c))
1340 return 0;
1341
1342 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1343 return 0;
1344
1345 negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1346
1347 if (c->syscall_whitelist) {
1348 default_action = negative_action;
1349 action = SCMP_ACT_ALLOW;
1350 } else {
1351 default_action = SCMP_ACT_ALLOW;
1352 action = negative_action;
1353 }
1354
1355 if (needs_ambient_hack) {
1356 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1357 if (r < 0)
1358 return r;
1359 }
1360
1361 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1362 }
1363
1364 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1365 assert(u);
1366 assert(c);
1367
1368 if (set_isempty(c->syscall_archs))
1369 return 0;
1370
1371 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1372 return 0;
1373
1374 return seccomp_restrict_archs(c->syscall_archs);
1375 }
1376
1377 static int apply_address_families(const Unit* u, const ExecContext *c) {
1378 assert(u);
1379 assert(c);
1380
1381 if (!context_has_address_families(c))
1382 return 0;
1383
1384 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1385 return 0;
1386
1387 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1388 }
1389
1390 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1391 assert(u);
1392 assert(c);
1393
1394 if (!c->memory_deny_write_execute)
1395 return 0;
1396
1397 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1398 return 0;
1399
1400 return seccomp_memory_deny_write_execute();
1401 }
1402
1403 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1404 assert(u);
1405 assert(c);
1406
1407 if (!c->restrict_realtime)
1408 return 0;
1409
1410 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1411 return 0;
1412
1413 return seccomp_restrict_realtime();
1414 }
1415
1416 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1417 assert(u);
1418 assert(c);
1419
1420 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1421 * let's protect even those systems where this is left on in the kernel. */
1422
1423 if (!c->protect_kernel_tunables)
1424 return 0;
1425
1426 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1427 return 0;
1428
1429 return seccomp_protect_sysctl();
1430 }
1431
1432 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1433 assert(u);
1434 assert(c);
1435
1436 /* Turn off module syscalls on ProtectKernelModules=yes */
1437
1438 if (!c->protect_kernel_modules)
1439 return 0;
1440
1441 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1442 return 0;
1443
1444 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1445 }
1446
1447 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1448 assert(u);
1449 assert(c);
1450
1451 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1452
1453 if (!c->private_devices)
1454 return 0;
1455
1456 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1457 return 0;
1458
1459 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1460 }
1461
1462 static int apply_restrict_namespaces(Unit *u, const ExecContext *c) {
1463 assert(u);
1464 assert(c);
1465
1466 if (!exec_context_restrict_namespaces_set(c))
1467 return 0;
1468
1469 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1470 return 0;
1471
1472 return seccomp_restrict_namespaces(c->restrict_namespaces);
1473 }
1474
1475 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1476 unsigned long personality;
1477 int r;
1478
1479 assert(u);
1480 assert(c);
1481
1482 if (!c->lock_personality)
1483 return 0;
1484
1485 if (skip_seccomp_unavailable(u, "LockPersonality="))
1486 return 0;
1487
1488 personality = c->personality;
1489
1490 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1491 if (personality == PERSONALITY_INVALID) {
1492
1493 r = opinionated_personality(&personality);
1494 if (r < 0)
1495 return r;
1496 }
1497
1498 return seccomp_lock_personality(personality);
1499 }
1500
1501 #endif
1502
1503 static void do_idle_pipe_dance(int idle_pipe[4]) {
1504 assert(idle_pipe);
1505
1506 idle_pipe[1] = safe_close(idle_pipe[1]);
1507 idle_pipe[2] = safe_close(idle_pipe[2]);
1508
1509 if (idle_pipe[0] >= 0) {
1510 int r;
1511
1512 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1513
1514 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1515 ssize_t n;
1516
1517 /* Signal systemd that we are bored and want to continue. */
1518 n = write(idle_pipe[3], "x", 1);
1519 if (n > 0)
1520 /* Wait for systemd to react to the signal above. */
1521 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1522 }
1523
1524 idle_pipe[0] = safe_close(idle_pipe[0]);
1525
1526 }
1527
1528 idle_pipe[3] = safe_close(idle_pipe[3]);
1529 }
1530
1531 static int build_environment(
1532 Unit *u,
1533 const ExecContext *c,
1534 const ExecParameters *p,
1535 unsigned n_fds,
1536 const char *home,
1537 const char *username,
1538 const char *shell,
1539 dev_t journal_stream_dev,
1540 ino_t journal_stream_ino,
1541 char ***ret) {
1542
1543 _cleanup_strv_free_ char **our_env = NULL;
1544 unsigned n_env = 0;
1545 char *x;
1546
1547 assert(u);
1548 assert(c);
1549 assert(ret);
1550
1551 our_env = new0(char*, 14);
1552 if (!our_env)
1553 return -ENOMEM;
1554
1555 if (n_fds > 0) {
1556 _cleanup_free_ char *joined = NULL;
1557
1558 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1559 return -ENOMEM;
1560 our_env[n_env++] = x;
1561
1562 if (asprintf(&x, "LISTEN_FDS=%u", n_fds) < 0)
1563 return -ENOMEM;
1564 our_env[n_env++] = x;
1565
1566 joined = strv_join(p->fd_names, ":");
1567 if (!joined)
1568 return -ENOMEM;
1569
1570 x = strjoin("LISTEN_FDNAMES=", joined);
1571 if (!x)
1572 return -ENOMEM;
1573 our_env[n_env++] = x;
1574 }
1575
1576 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1577 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1578 return -ENOMEM;
1579 our_env[n_env++] = x;
1580
1581 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1582 return -ENOMEM;
1583 our_env[n_env++] = x;
1584 }
1585
1586 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1587 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1588 * check the database directly. */
1589 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1590 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1591 if (!x)
1592 return -ENOMEM;
1593 our_env[n_env++] = x;
1594 }
1595
1596 if (home) {
1597 x = strappend("HOME=", home);
1598 if (!x)
1599 return -ENOMEM;
1600 our_env[n_env++] = x;
1601 }
1602
1603 if (username) {
1604 x = strappend("LOGNAME=", username);
1605 if (!x)
1606 return -ENOMEM;
1607 our_env[n_env++] = x;
1608
1609 x = strappend("USER=", username);
1610 if (!x)
1611 return -ENOMEM;
1612 our_env[n_env++] = x;
1613 }
1614
1615 if (shell) {
1616 x = strappend("SHELL=", shell);
1617 if (!x)
1618 return -ENOMEM;
1619 our_env[n_env++] = x;
1620 }
1621
1622 if (!sd_id128_is_null(u->invocation_id)) {
1623 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1624 return -ENOMEM;
1625
1626 our_env[n_env++] = x;
1627 }
1628
1629 if (exec_context_needs_term(c)) {
1630 const char *tty_path, *term = NULL;
1631
1632 tty_path = exec_context_tty_path(c);
1633
1634 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1635 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1636 * passes to PID 1 ends up all the way in the console login shown. */
1637
1638 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1639 term = getenv("TERM");
1640 if (!term)
1641 term = default_term_for_tty(tty_path);
1642
1643 x = strappend("TERM=", term);
1644 if (!x)
1645 return -ENOMEM;
1646 our_env[n_env++] = x;
1647 }
1648
1649 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1650 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1651 return -ENOMEM;
1652
1653 our_env[n_env++] = x;
1654 }
1655
1656 our_env[n_env++] = NULL;
1657 assert(n_env <= 12);
1658
1659 *ret = our_env;
1660 our_env = NULL;
1661
1662 return 0;
1663 }
1664
1665 static int build_pass_environment(const ExecContext *c, char ***ret) {
1666 _cleanup_strv_free_ char **pass_env = NULL;
1667 size_t n_env = 0, n_bufsize = 0;
1668 char **i;
1669
1670 STRV_FOREACH(i, c->pass_environment) {
1671 _cleanup_free_ char *x = NULL;
1672 char *v;
1673
1674 v = getenv(*i);
1675 if (!v)
1676 continue;
1677 x = strjoin(*i, "=", v);
1678 if (!x)
1679 return -ENOMEM;
1680
1681 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1682 return -ENOMEM;
1683
1684 pass_env[n_env++] = x;
1685 pass_env[n_env] = NULL;
1686 x = NULL;
1687 }
1688
1689 *ret = pass_env;
1690 pass_env = NULL;
1691
1692 return 0;
1693 }
1694
1695 static bool exec_needs_mount_namespace(
1696 const ExecContext *context,
1697 const ExecParameters *params,
1698 ExecRuntime *runtime) {
1699
1700 assert(context);
1701 assert(params);
1702
1703 if (context->root_image)
1704 return true;
1705
1706 if (!strv_isempty(context->read_write_paths) ||
1707 !strv_isempty(context->read_only_paths) ||
1708 !strv_isempty(context->inaccessible_paths))
1709 return true;
1710
1711 if (context->n_bind_mounts > 0)
1712 return true;
1713
1714 if (context->mount_flags != 0)
1715 return true;
1716
1717 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1718 return true;
1719
1720 if (context->private_devices ||
1721 context->protect_system != PROTECT_SYSTEM_NO ||
1722 context->protect_home != PROTECT_HOME_NO ||
1723 context->protect_kernel_tunables ||
1724 context->protect_kernel_modules ||
1725 context->protect_control_groups)
1726 return true;
1727
1728 if (context->mount_apivfs && (context->root_image || context->root_directory))
1729 return true;
1730
1731 if (context->dynamic_user &&
1732 (!strv_isempty(context->directories[EXEC_DIRECTORY_RUNTIME].paths) ||
1733 !strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1734 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1735 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1736 return true;
1737
1738 return false;
1739 }
1740
1741 static int setup_private_users(uid_t uid, gid_t gid) {
1742 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1743 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1744 _cleanup_close_ int unshare_ready_fd = -1;
1745 _cleanup_(sigkill_waitp) pid_t pid = 0;
1746 uint64_t c = 1;
1747 siginfo_t si;
1748 ssize_t n;
1749 int r;
1750
1751 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1752 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1753 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1754 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1755 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1756 * continues execution normally. */
1757
1758 if (uid != 0 && uid_is_valid(uid)) {
1759 r = asprintf(&uid_map,
1760 "0 0 1\n" /* Map root → root */
1761 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
1762 uid, uid);
1763 if (r < 0)
1764 return -ENOMEM;
1765 } else {
1766 uid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1767 if (!uid_map)
1768 return -ENOMEM;
1769 }
1770
1771 if (gid != 0 && gid_is_valid(gid)) {
1772 r = asprintf(&gid_map,
1773 "0 0 1\n" /* Map root → root */
1774 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
1775 gid, gid);
1776 if (r < 0)
1777 return -ENOMEM;
1778 } else {
1779 gid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1780 if (!gid_map)
1781 return -ENOMEM;
1782 }
1783
1784 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1785 * namespace. */
1786 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1787 if (unshare_ready_fd < 0)
1788 return -errno;
1789
1790 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1791 * failed. */
1792 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1793 return -errno;
1794
1795 pid = fork();
1796 if (pid < 0)
1797 return -errno;
1798
1799 if (pid == 0) {
1800 _cleanup_close_ int fd = -1;
1801 const char *a;
1802 pid_t ppid;
1803
1804 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1805 * here, after the parent opened its own user namespace. */
1806
1807 ppid = getppid();
1808 errno_pipe[0] = safe_close(errno_pipe[0]);
1809
1810 /* Wait until the parent unshared the user namespace */
1811 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1812 r = -errno;
1813 goto child_fail;
1814 }
1815
1816 /* Disable the setgroups() system call in the child user namespace, for good. */
1817 a = procfs_file_alloca(ppid, "setgroups");
1818 fd = open(a, O_WRONLY|O_CLOEXEC);
1819 if (fd < 0) {
1820 if (errno != ENOENT) {
1821 r = -errno;
1822 goto child_fail;
1823 }
1824
1825 /* If the file is missing the kernel is too old, let's continue anyway. */
1826 } else {
1827 if (write(fd, "deny\n", 5) < 0) {
1828 r = -errno;
1829 goto child_fail;
1830 }
1831
1832 fd = safe_close(fd);
1833 }
1834
1835 /* First write the GID map */
1836 a = procfs_file_alloca(ppid, "gid_map");
1837 fd = open(a, O_WRONLY|O_CLOEXEC);
1838 if (fd < 0) {
1839 r = -errno;
1840 goto child_fail;
1841 }
1842 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1843 r = -errno;
1844 goto child_fail;
1845 }
1846 fd = safe_close(fd);
1847
1848 /* The write the UID map */
1849 a = procfs_file_alloca(ppid, "uid_map");
1850 fd = open(a, O_WRONLY|O_CLOEXEC);
1851 if (fd < 0) {
1852 r = -errno;
1853 goto child_fail;
1854 }
1855 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1856 r = -errno;
1857 goto child_fail;
1858 }
1859
1860 _exit(EXIT_SUCCESS);
1861
1862 child_fail:
1863 (void) write(errno_pipe[1], &r, sizeof(r));
1864 _exit(EXIT_FAILURE);
1865 }
1866
1867 errno_pipe[1] = safe_close(errno_pipe[1]);
1868
1869 if (unshare(CLONE_NEWUSER) < 0)
1870 return -errno;
1871
1872 /* Let the child know that the namespace is ready now */
1873 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1874 return -errno;
1875
1876 /* Try to read an error code from the child */
1877 n = read(errno_pipe[0], &r, sizeof(r));
1878 if (n < 0)
1879 return -errno;
1880 if (n == sizeof(r)) { /* an error code was sent to us */
1881 if (r < 0)
1882 return r;
1883 return -EIO;
1884 }
1885 if (n != 0) /* on success we should have read 0 bytes */
1886 return -EIO;
1887
1888 r = wait_for_terminate(pid, &si);
1889 if (r < 0)
1890 return r;
1891 pid = 0;
1892
1893 /* If something strange happened with the child, let's consider this fatal, too */
1894 if (si.si_code != CLD_EXITED || si.si_status != 0)
1895 return -EIO;
1896
1897 return 0;
1898 }
1899
1900 static int setup_exec_directory(
1901 const ExecContext *context,
1902 const ExecParameters *params,
1903 uid_t uid,
1904 gid_t gid,
1905 ExecDirectoryType type,
1906 int *exit_status) {
1907
1908 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1909 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1910 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1911 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1912 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
1913 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
1914 };
1915 char **rt;
1916 int r;
1917
1918 assert(context);
1919 assert(params);
1920 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
1921 assert(exit_status);
1922
1923 if (!params->prefix[type])
1924 return 0;
1925
1926 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
1927 if (!uid_is_valid(uid))
1928 uid = 0;
1929 if (!gid_is_valid(gid))
1930 gid = 0;
1931 }
1932
1933 STRV_FOREACH(rt, context->directories[type].paths) {
1934 _cleanup_free_ char *p = NULL, *pp = NULL;
1935 const char *effective;
1936
1937 p = strjoin(params->prefix[type], "/", *rt);
1938 if (!p) {
1939 r = -ENOMEM;
1940 goto fail;
1941 }
1942
1943 r = mkdir_parents_label(p, 0755);
1944 if (r < 0)
1945 goto fail;
1946
1947 if (context->dynamic_user && type != EXEC_DIRECTORY_CONFIGURATION) {
1948 _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
1949
1950 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
1951 * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
1952 * whose UID is later on reused. To lock this down we use the same trick used by container
1953 * managers to prohibit host users to get access to files of the same UID in containers: we
1954 * place everything inside a directory that has an access mode of 0700 and is owned root:root,
1955 * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
1956 * to make this directory permeable for the service itself.
1957 *
1958 * Specifically: for a service which wants a special directory "foo/" we first create a
1959 * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
1960 * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
1961 * privileged host users can access "foo/" as usual, but unprivileged host users can't look
1962 * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
1963 * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
1964 * disabling the access boundary for the service and making sure it only gets access to the
1965 * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
1966 *
1967 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
1968 * owned by the service itself. */
1969
1970 private_root = strjoin(params->prefix[type], "/private");
1971 if (!private_root) {
1972 r = -ENOMEM;
1973 goto fail;
1974 }
1975
1976 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
1977 r = mkdir_safe_label(private_root, 0700, 0, 0);
1978 if (r < 0)
1979 goto fail;
1980
1981 pp = strjoin(private_root, "/", *rt);
1982 if (!pp) {
1983 r = -ENOMEM;
1984 goto fail;
1985 }
1986
1987 /* Create all directories between the configured directory and this private root, and mark them 0755 */
1988 r = mkdir_parents_label(pp, 0755);
1989 if (r < 0)
1990 goto fail;
1991
1992 /* Finally, create the actual directory for the service */
1993 r = mkdir_label(pp, context->directories[type].mode);
1994 if (r < 0 && r != -EEXIST)
1995 goto fail;
1996
1997 parent = dirname_malloc(p);
1998 if (!parent) {
1999 r = -ENOMEM;
2000 goto fail;
2001 }
2002
2003 r = path_make_relative(parent, pp, &relative);
2004 if (r < 0)
2005 goto fail;
2006
2007 /* And link it up from the original place */
2008 r = symlink_idempotent(relative, p);
2009 if (r < 0)
2010 goto fail;
2011
2012 effective = pp;
2013
2014 } else {
2015 r = mkdir_label(p, context->directories[type].mode);
2016 if (r < 0 && r != -EEXIST)
2017 goto fail;
2018
2019 effective = p;
2020 }
2021
2022 /* First lock down the access mode */
2023 if (chmod(effective, context->directories[type].mode) < 0) {
2024 r = -errno;
2025 goto fail;
2026 }
2027
2028 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2029 * a service, and shall not be writable. */
2030 if (type == EXEC_DIRECTORY_CONFIGURATION)
2031 continue;
2032
2033 /* Then, change the ownership of the whole tree, if necessary */
2034 r = path_chown_recursive(effective, uid, gid);
2035 if (r < 0)
2036 goto fail;
2037 }
2038
2039 return 0;
2040
2041 fail:
2042 *exit_status = exit_status_table[type];
2043 return r;
2044 }
2045
2046 static int setup_smack(
2047 const ExecContext *context,
2048 const ExecCommand *command) {
2049
2050 int r;
2051
2052 assert(context);
2053 assert(command);
2054
2055 if (context->smack_process_label) {
2056 r = mac_smack_apply_pid(0, context->smack_process_label);
2057 if (r < 0)
2058 return r;
2059 }
2060 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2061 else {
2062 _cleanup_free_ char *exec_label = NULL;
2063
2064 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2065 if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
2066 return r;
2067
2068 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2069 if (r < 0)
2070 return r;
2071 }
2072 #endif
2073
2074 return 0;
2075 }
2076
2077 static int compile_read_write_paths(
2078 const ExecContext *context,
2079 const ExecParameters *params,
2080 char ***ret) {
2081
2082 _cleanup_strv_free_ char **l = NULL;
2083 char **rt;
2084 ExecDirectoryType i;
2085
2086 /* Compile the list of writable paths. This is the combination of
2087 * the explicitly configured paths, plus all runtime directories. */
2088
2089 if (strv_isempty(context->read_write_paths)) {
2090 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
2091 if (!strv_isempty(context->directories[i].paths))
2092 break;
2093
2094 if (i == _EXEC_DIRECTORY_TYPE_MAX) {
2095 *ret = NULL; /* NOP if neither is set */
2096 return 0;
2097 }
2098 }
2099
2100 l = strv_copy(context->read_write_paths);
2101 if (!l)
2102 return -ENOMEM;
2103
2104 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++) {
2105 if (!params->prefix[i])
2106 continue;
2107
2108 STRV_FOREACH(rt, context->directories[i].paths) {
2109 char *s;
2110
2111 s = strjoin(params->prefix[i], "/", *rt);
2112 if (!s)
2113 return -ENOMEM;
2114
2115 if (strv_consume(&l, s) < 0)
2116 return -ENOMEM;
2117 }
2118 }
2119
2120 *ret = l;
2121 l = NULL;
2122
2123 return 0;
2124 }
2125
2126 static int compile_bind_mounts(
2127 const ExecContext *context,
2128 const ExecParameters *params,
2129 BindMount **ret_bind_mounts,
2130 unsigned *ret_n_bind_mounts,
2131 char ***ret_empty_directories) {
2132
2133 _cleanup_strv_free_ char **empty_directories = NULL;
2134 BindMount *bind_mounts;
2135 unsigned n, h = 0, i;
2136 ExecDirectoryType t;
2137 int r;
2138
2139 assert(context);
2140 assert(params);
2141 assert(ret_bind_mounts);
2142 assert(ret_n_bind_mounts);
2143 assert(ret_empty_directories);
2144
2145 n = context->n_bind_mounts;
2146 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2147 if (!params->prefix[t])
2148 continue;
2149
2150 n += strv_length(context->directories[t].paths);
2151 }
2152
2153 if (n <= 0) {
2154 *ret_bind_mounts = NULL;
2155 *ret_n_bind_mounts = 0;
2156 *ret_empty_directories = NULL;
2157 return 0;
2158 }
2159
2160 bind_mounts = new(BindMount, n);
2161 if (!bind_mounts)
2162 return -ENOMEM;
2163
2164 for (i = 0; context->n_bind_mounts; i++) {
2165 BindMount *item = context->bind_mounts + i;
2166 char *s, *d;
2167
2168 s = strdup(item->source);
2169 if (!s) {
2170 r = -ENOMEM;
2171 goto finish;
2172 }
2173
2174 d = strdup(item->destination);
2175 if (!d) {
2176 free(s);
2177 r = -ENOMEM;
2178 goto finish;
2179 }
2180
2181 bind_mounts[h++] = (BindMount) {
2182 .source = s,
2183 .destination = d,
2184 .read_only = item->read_only,
2185 .recursive = item->recursive,
2186 .ignore_enoent = item->ignore_enoent,
2187 };
2188 }
2189
2190 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2191 char **suffix;
2192
2193 if (!params->prefix[t])
2194 continue;
2195
2196 if (strv_isempty(context->directories[t].paths))
2197 continue;
2198
2199 if (context->dynamic_user && t != EXEC_DIRECTORY_CONFIGURATION) {
2200 char *private_root;
2201
2202 /* So this is for a dynamic user, and we need to make sure the process can access its own
2203 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2204 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2205
2206 private_root = strjoin(params->prefix[t], "/private");
2207 if (!private_root) {
2208 r = -ENOMEM;
2209 goto finish;
2210 }
2211
2212 r = strv_consume(&empty_directories, private_root);
2213 if (r < 0) {
2214 r = -ENOMEM;
2215 goto finish;
2216 }
2217 }
2218
2219 STRV_FOREACH(suffix, context->directories[t].paths) {
2220 char *s, *d;
2221
2222 if (context->dynamic_user && t != EXEC_DIRECTORY_CONFIGURATION)
2223 s = strjoin(params->prefix[t], "/private/", *suffix);
2224 else
2225 s = strjoin(params->prefix[t], "/", *suffix);
2226 if (!s) {
2227 r = -ENOMEM;
2228 goto finish;
2229 }
2230
2231 d = strdup(s);
2232 if (!d) {
2233 free(s);
2234 r = -ENOMEM;
2235 goto finish;
2236 }
2237
2238 bind_mounts[h++] = (BindMount) {
2239 .source = s,
2240 .destination = d,
2241 .read_only = false,
2242 .recursive = true,
2243 .ignore_enoent = false,
2244 };
2245 }
2246 }
2247
2248 assert(h == n);
2249
2250 *ret_bind_mounts = bind_mounts;
2251 *ret_n_bind_mounts = n;
2252 *ret_empty_directories = empty_directories;
2253
2254 empty_directories = NULL;
2255
2256 return (int) n;
2257
2258 finish:
2259 bind_mount_free_many(bind_mounts, h);
2260 return r;
2261 }
2262
2263 static int apply_mount_namespace(
2264 Unit *u,
2265 ExecCommand *command,
2266 const ExecContext *context,
2267 const ExecParameters *params,
2268 ExecRuntime *runtime) {
2269
2270 _cleanup_strv_free_ char **rw = NULL, **empty_directories = NULL;
2271 char *tmp = NULL, *var = NULL;
2272 const char *root_dir = NULL, *root_image = NULL;
2273 NameSpaceInfo ns_info = {
2274 .ignore_protect_paths = false,
2275 .private_dev = context->private_devices,
2276 .protect_control_groups = context->protect_control_groups,
2277 .protect_kernel_tunables = context->protect_kernel_tunables,
2278 .protect_kernel_modules = context->protect_kernel_modules,
2279 .mount_apivfs = context->mount_apivfs,
2280 };
2281 bool needs_sandboxing;
2282 BindMount *bind_mounts = NULL;
2283 unsigned n_bind_mounts = 0;
2284 int r;
2285
2286 assert(context);
2287
2288 /* The runtime struct only contains the parent of the private /tmp,
2289 * which is non-accessible to world users. Inside of it there's a /tmp
2290 * that is sticky, and that's the one we want to use here. */
2291
2292 if (context->private_tmp && runtime) {
2293 if (runtime->tmp_dir)
2294 tmp = strjoina(runtime->tmp_dir, "/tmp");
2295 if (runtime->var_tmp_dir)
2296 var = strjoina(runtime->var_tmp_dir, "/tmp");
2297 }
2298
2299 r = compile_read_write_paths(context, params, &rw);
2300 if (r < 0)
2301 return r;
2302
2303 if (params->flags & EXEC_APPLY_CHROOT) {
2304 root_image = context->root_image;
2305
2306 if (!root_image)
2307 root_dir = context->root_directory;
2308 }
2309
2310 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2311 if (r < 0)
2312 return r;
2313
2314 /*
2315 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2316 * sandbox info, otherwise enforce it, don't ignore protected paths and
2317 * fail if we are enable to apply the sandbox inside the mount namespace.
2318 */
2319 if (!context->dynamic_user && root_dir)
2320 ns_info.ignore_protect_paths = true;
2321
2322 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2323
2324 r = setup_namespace(root_dir, root_image,
2325 &ns_info, rw,
2326 needs_sandboxing ? context->read_only_paths : NULL,
2327 needs_sandboxing ? context->inaccessible_paths : NULL,
2328 empty_directories,
2329 bind_mounts,
2330 n_bind_mounts,
2331 tmp,
2332 var,
2333 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2334 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2335 context->mount_flags,
2336 DISSECT_IMAGE_DISCARD_ON_LOOP);
2337
2338 bind_mount_free_many(bind_mounts, n_bind_mounts);
2339
2340 /* If we couldn't set up the namespace this is probably due to a
2341 * missing capability. In this case, silently proceeed. */
2342 if (IN_SET(r, -EPERM, -EACCES)) {
2343 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2344 return 0;
2345 }
2346
2347 return r;
2348 }
2349
2350 static int apply_working_directory(
2351 const ExecContext *context,
2352 const ExecParameters *params,
2353 const char *home,
2354 const bool needs_mount_ns,
2355 int *exit_status) {
2356
2357 const char *d, *wd;
2358
2359 assert(context);
2360 assert(exit_status);
2361
2362 if (context->working_directory_home) {
2363
2364 if (!home) {
2365 *exit_status = EXIT_CHDIR;
2366 return -ENXIO;
2367 }
2368
2369 wd = home;
2370
2371 } else if (context->working_directory)
2372 wd = context->working_directory;
2373 else
2374 wd = "/";
2375
2376 if (params->flags & EXEC_APPLY_CHROOT) {
2377 if (!needs_mount_ns && context->root_directory)
2378 if (chroot(context->root_directory) < 0) {
2379 *exit_status = EXIT_CHROOT;
2380 return -errno;
2381 }
2382
2383 d = wd;
2384 } else
2385 d = prefix_roota(context->root_directory, wd);
2386
2387 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2388 *exit_status = EXIT_CHDIR;
2389 return -errno;
2390 }
2391
2392 return 0;
2393 }
2394
2395 static int setup_keyring(
2396 Unit *u,
2397 const ExecContext *context,
2398 const ExecParameters *p,
2399 uid_t uid, gid_t gid) {
2400
2401 key_serial_t keyring;
2402 int r;
2403
2404 assert(u);
2405 assert(context);
2406 assert(p);
2407
2408 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2409 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2410 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2411 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2412 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2413 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2414
2415 if (!(p->flags & EXEC_NEW_KEYRING))
2416 return 0;
2417
2418 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2419 return 0;
2420
2421 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2422 if (keyring == -1) {
2423 if (errno == ENOSYS)
2424 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2425 else if (IN_SET(errno, EACCES, EPERM))
2426 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2427 else if (errno == EDQUOT)
2428 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2429 else
2430 return log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2431
2432 return 0;
2433 }
2434
2435 /* Populate they keyring with the invocation ID by default. */
2436 if (!sd_id128_is_null(u->invocation_id)) {
2437 key_serial_t key;
2438
2439 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2440 if (key == -1)
2441 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2442 else {
2443 if (keyctl(KEYCTL_SETPERM, key,
2444 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2445 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2446 return log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2447 }
2448 }
2449
2450 /* And now, make the keyring owned by the service's user */
2451 if (uid_is_valid(uid) || gid_is_valid(gid))
2452 if (keyctl(KEYCTL_CHOWN, keyring, uid, gid, 0) < 0)
2453 return log_unit_error_errno(u, errno, "Failed to change ownership of session keyring: %m");
2454
2455 /* When requested link the user keyring into the session keyring. */
2456 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2457 uid_t saved_uid;
2458 gid_t saved_gid;
2459
2460 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
2461 * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
2462 * sucks for parallel execution. This mimics what pam_keyinit does, too.*/
2463
2464 saved_uid = getuid();
2465 saved_gid = getgid();
2466
2467 if (gid_is_valid(gid) && gid != saved_gid) {
2468 if (setregid(gid, -1) < 0)
2469 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2470 }
2471
2472 if (uid_is_valid(uid) && uid != saved_uid) {
2473 if (setreuid(uid, -1) < 0) {
2474 (void) setregid(saved_gid, -1);
2475 return log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2476 }
2477 }
2478
2479 if (keyctl(KEYCTL_LINK,
2480 KEY_SPEC_USER_KEYRING,
2481 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2482
2483 r = -errno;
2484
2485 (void) setreuid(saved_uid, -1);
2486 (void) setregid(saved_gid, -1);
2487
2488 return log_unit_error_errno(u, r, "Failed to link user keyring into session keyring: %m");
2489 }
2490
2491 if (uid_is_valid(uid) && uid != saved_uid) {
2492 if (setreuid(saved_uid, -1) < 0) {
2493 (void) setregid(saved_gid, -1);
2494 return log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2495 }
2496 }
2497
2498 if (gid_is_valid(gid) && gid != saved_gid) {
2499 if (setregid(saved_gid, -1) < 0)
2500 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2501 }
2502 }
2503
2504 return 0;
2505 }
2506
/* Appends each valid (i.e. non-negative) fd of the specified socket pair to array[], advancing the
 * element counter *n accordingly. A NULL pair is accepted and ignored. The caller has to make sure
 * array[] has room for up to two more entries. */
static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
        unsigned k;

        assert(array);
        assert(n);

        if (pair)
                for (k = 0; k < 2; k++) {
                        if (pair[k] < 0)
                                continue;

                        array[*n] = pair[k];
                        (*n)++;
                }
}
2519
2520 static int close_remaining_fds(
2521 const ExecParameters *params,
2522 ExecRuntime *runtime,
2523 DynamicCreds *dcreds,
2524 int user_lookup_fd,
2525 int socket_fd,
2526 int *fds, unsigned n_fds) {
2527
2528 unsigned n_dont_close = 0;
2529 int dont_close[n_fds + 12];
2530
2531 assert(params);
2532
2533 if (params->stdin_fd >= 0)
2534 dont_close[n_dont_close++] = params->stdin_fd;
2535 if (params->stdout_fd >= 0)
2536 dont_close[n_dont_close++] = params->stdout_fd;
2537 if (params->stderr_fd >= 0)
2538 dont_close[n_dont_close++] = params->stderr_fd;
2539
2540 if (socket_fd >= 0)
2541 dont_close[n_dont_close++] = socket_fd;
2542 if (n_fds > 0) {
2543 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2544 n_dont_close += n_fds;
2545 }
2546
2547 if (runtime)
2548 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2549
2550 if (dcreds) {
2551 if (dcreds->user)
2552 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2553 if (dcreds->group)
2554 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2555 }
2556
2557 if (user_lookup_fd >= 0)
2558 dont_close[n_dont_close++] = user_lookup_fd;
2559
2560 return close_all_fds(dont_close, n_dont_close);
2561 }
2562
2563 static int send_user_lookup(
2564 Unit *unit,
2565 int user_lookup_fd,
2566 uid_t uid,
2567 gid_t gid) {
2568
2569 assert(unit);
2570
2571 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2572 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2573 * specified. */
2574
2575 if (user_lookup_fd < 0)
2576 return 0;
2577
2578 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2579 return 0;
2580
2581 if (writev(user_lookup_fd,
2582 (struct iovec[]) {
2583 IOVEC_INIT(&uid, sizeof(uid)),
2584 IOVEC_INIT(&gid, sizeof(gid)),
2585 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2586 return -errno;
2587
2588 return 0;
2589 }
2590
2591 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2592 int r;
2593
2594 assert(c);
2595 assert(home);
2596 assert(buf);
2597
2598 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2599
2600 if (*home)
2601 return 0;
2602
2603 if (!c->working_directory_home)
2604 return 0;
2605
2606 if (uid == 0) {
2607 /* Hardcode /root as home directory for UID 0 */
2608 *home = "/root";
2609 return 1;
2610 }
2611
2612 r = get_home_dir(buf);
2613 if (r < 0)
2614 return r;
2615
2616 *home = *buf;
2617 return 1;
2618 }
2619
2620 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2621 _cleanup_strv_free_ char ** list = NULL;
2622 ExecDirectoryType t;
2623 int r;
2624
2625 assert(c);
2626 assert(p);
2627 assert(ret);
2628
2629 assert(c->dynamic_user);
2630
2631 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2632 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2633 * directories. */
2634
2635 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2636 char **i;
2637
2638 if (t == EXEC_DIRECTORY_CONFIGURATION)
2639 continue;
2640
2641 if (!p->prefix[t])
2642 continue;
2643
2644 STRV_FOREACH(i, c->directories[t].paths) {
2645 char *e;
2646
2647 e = strjoin(p->prefix[t], "/private/", *i);
2648 if (!e)
2649 return -ENOMEM;
2650
2651 r = strv_consume(&list, e);
2652 if (r < 0)
2653 return r;
2654 }
2655 }
2656
2657 *ret = list;
2658 list = NULL;
2659
2660 return 0;
2661 }
2662
2663 static int exec_child(
2664 Unit *unit,
2665 ExecCommand *command,
2666 const ExecContext *context,
2667 const ExecParameters *params,
2668 ExecRuntime *runtime,
2669 DynamicCreds *dcreds,
2670 char **argv,
2671 int socket_fd,
2672 int named_iofds[3],
2673 int *fds,
2674 unsigned n_storage_fds,
2675 unsigned n_socket_fds,
2676 char **files_env,
2677 int user_lookup_fd,
2678 int *exit_status) {
2679
2680 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2681 _cleanup_free_ char *mac_selinux_context_net = NULL, *home_buffer = NULL;
2682 _cleanup_free_ gid_t *supplementary_gids = NULL;
2683 const char *username = NULL, *groupname = NULL;
2684 const char *home = NULL, *shell = NULL;
2685 dev_t journal_stream_dev = 0;
2686 ino_t journal_stream_ino = 0;
2687 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2688 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
2689 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
2690 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
2691 #if HAVE_SELINUX
2692 bool use_selinux = false;
2693 #endif
2694 #if HAVE_SMACK
2695 bool use_smack = false;
2696 #endif
2697 #if HAVE_APPARMOR
2698 bool use_apparmor = false;
2699 #endif
2700 uid_t uid = UID_INVALID;
2701 gid_t gid = GID_INVALID;
2702 int i, r, ngids = 0;
2703 unsigned n_fds;
2704 ExecDirectoryType dt;
2705 int secure_bits;
2706
2707 assert(unit);
2708 assert(command);
2709 assert(context);
2710 assert(params);
2711 assert(exit_status);
2712
2713 rename_process_from_path(command->path);
2714
2715 /* We reset exactly these signals, since they are the
2716 * only ones we set to SIG_IGN in the main daemon. All
2717 * others we leave untouched because we set them to
2718 * SIG_DFL or a valid handler initially, both of which
2719 * will be demoted to SIG_DFL. */
2720 (void) default_signals(SIGNALS_CRASH_HANDLER,
2721 SIGNALS_IGNORE, -1);
2722
2723 if (context->ignore_sigpipe)
2724 (void) ignore_signals(SIGPIPE, -1);
2725
2726 r = reset_signal_mask();
2727 if (r < 0) {
2728 *exit_status = EXIT_SIGNAL_MASK;
2729 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2730 }
2731
2732 if (params->idle_pipe)
2733 do_idle_pipe_dance(params->idle_pipe);
2734
2735 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2736 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2737 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2738 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2739
2740 log_forget_fds();
2741 log_set_open_when_needed(true);
2742
2743 /* In case anything used libc syslog(), close this here, too */
2744 closelog();
2745
2746 n_fds = n_storage_fds + n_socket_fds;
2747 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
2748 if (r < 0) {
2749 *exit_status = EXIT_FDS;
2750 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2751 }
2752
2753 if (!context->same_pgrp)
2754 if (setsid() < 0) {
2755 *exit_status = EXIT_SETSID;
2756 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2757 }
2758
2759 exec_context_tty_reset(context, params);
2760
2761 if (unit_shall_confirm_spawn(unit)) {
2762 const char *vc = params->confirm_spawn;
2763 _cleanup_free_ char *cmdline = NULL;
2764
2765 cmdline = exec_command_line(argv);
2766 if (!cmdline) {
2767 *exit_status = EXIT_MEMORY;
2768 return log_oom();
2769 }
2770
2771 r = ask_for_confirmation(vc, unit, cmdline);
2772 if (r != CONFIRM_EXECUTE) {
2773 if (r == CONFIRM_PRETEND_SUCCESS) {
2774 *exit_status = EXIT_SUCCESS;
2775 return 0;
2776 }
2777 *exit_status = EXIT_CONFIRM;
2778 log_unit_error(unit, "Execution cancelled by the user");
2779 return -ECANCELED;
2780 }
2781 }
2782
2783 if (context->dynamic_user && dcreds) {
2784 _cleanup_strv_free_ char **suggested_paths = NULL;
2785
2786 /* Make sure we bypass our own NSS module for any NSS checks */
2787 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2788 *exit_status = EXIT_USER;
2789 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2790 }
2791
2792 r = compile_suggested_paths(context, params, &suggested_paths);
2793 if (r < 0) {
2794 *exit_status = EXIT_MEMORY;
2795 return log_oom();
2796 }
2797
2798 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2799 if (r < 0) {
2800 *exit_status = EXIT_USER;
2801 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2802 }
2803
2804 if (!uid_is_valid(uid)) {
2805 *exit_status = EXIT_USER;
2806 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2807 return -ESRCH;
2808 }
2809
2810 if (!gid_is_valid(gid)) {
2811 *exit_status = EXIT_USER;
2812 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2813 return -ESRCH;
2814 }
2815
2816 if (dcreds->user)
2817 username = dcreds->user->name;
2818
2819 } else {
2820 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2821 if (r < 0) {
2822 *exit_status = EXIT_USER;
2823 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2824 }
2825
2826 r = get_fixed_group(context, &groupname, &gid);
2827 if (r < 0) {
2828 *exit_status = EXIT_GROUP;
2829 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2830 }
2831 }
2832
2833 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2834 r = get_supplementary_groups(context, username, groupname, gid,
2835 &supplementary_gids, &ngids);
2836 if (r < 0) {
2837 *exit_status = EXIT_GROUP;
2838 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2839 }
2840
2841 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2842 if (r < 0) {
2843 *exit_status = EXIT_USER;
2844 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2845 }
2846
2847 user_lookup_fd = safe_close(user_lookup_fd);
2848
2849 r = acquire_home(context, uid, &home, &home_buffer);
2850 if (r < 0) {
2851 *exit_status = EXIT_CHDIR;
2852 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2853 }
2854
2855 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2856 * must sure to drop O_NONBLOCK */
2857 if (socket_fd >= 0)
2858 (void) fd_nonblock(socket_fd, false);
2859
2860 r = setup_input(context, params, socket_fd, named_iofds);
2861 if (r < 0) {
2862 *exit_status = EXIT_STDIN;
2863 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2864 }
2865
2866 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2867 if (r < 0) {
2868 *exit_status = EXIT_STDOUT;
2869 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2870 }
2871
2872 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2873 if (r < 0) {
2874 *exit_status = EXIT_STDERR;
2875 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2876 }
2877
2878 if (params->cgroup_path) {
2879 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2880 if (r < 0) {
2881 *exit_status = EXIT_CGROUP;
2882 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2883 }
2884 }
2885
2886 if (context->oom_score_adjust_set) {
2887 char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
2888
2889 /* When we can't make this change due to EPERM, then
2890 * let's silently skip over it. User namespaces
2891 * prohibit write access to this file, and we
2892 * shouldn't trip up over that. */
2893
2894 sprintf(t, "%i", context->oom_score_adjust);
2895 r = write_string_file("/proc/self/oom_score_adj", t, 0);
2896 if (IN_SET(r, -EPERM, -EACCES))
2897 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2898 else if (r < 0) {
2899 *exit_status = EXIT_OOM_ADJUST;
2900 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
2901 }
2902 }
2903
2904 if (context->nice_set)
2905 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2906 *exit_status = EXIT_NICE;
2907 return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
2908 }
2909
2910 if (context->cpu_sched_set) {
2911 struct sched_param param = {
2912 .sched_priority = context->cpu_sched_priority,
2913 };
2914
2915 r = sched_setscheduler(0,
2916 context->cpu_sched_policy |
2917 (context->cpu_sched_reset_on_fork ?
2918 SCHED_RESET_ON_FORK : 0),
2919 &param);
2920 if (r < 0) {
2921 *exit_status = EXIT_SETSCHEDULER;
2922 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
2923 }
2924 }
2925
2926 if (context->cpuset)
2927 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2928 *exit_status = EXIT_CPUAFFINITY;
2929 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
2930 }
2931
2932 if (context->ioprio_set)
2933 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
2934 *exit_status = EXIT_IOPRIO;
2935 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
2936 }
2937
2938 if (context->timer_slack_nsec != NSEC_INFINITY)
2939 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
2940 *exit_status = EXIT_TIMERSLACK;
2941 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
2942 }
2943
2944 if (context->personality != PERSONALITY_INVALID) {
2945 r = safe_personality(context->personality);
2946 if (r < 0) {
2947 *exit_status = EXIT_PERSONALITY;
2948 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
2949 }
2950 }
2951
2952 if (context->utmp_id)
2953 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
2954 context->tty_path,
2955 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
2956 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
2957 USER_PROCESS,
2958 username);
2959
2960 if (context->user) {
2961 r = chown_terminal(STDIN_FILENO, uid);
2962 if (r < 0) {
2963 *exit_status = EXIT_STDIN;
2964 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
2965 }
2966 }
2967
2968 /* If delegation is enabled we'll pass ownership of the cgroup
2969 * (but only in systemd's own controller hierarchy!) to the
2970 * user of the new process. */
2971 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
2972 r = cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0644, uid, gid);
2973 if (r < 0) {
2974 *exit_status = EXIT_CGROUP;
2975 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
2976 }
2977
2978 r = cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0755, uid, gid);
2979 if (r < 0) {
2980 *exit_status = EXIT_CGROUP;
2981 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
2982 }
2983 }
2984
2985 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2986 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
2987 if (r < 0)
2988 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
2989 }
2990
2991 r = build_environment(
2992 unit,
2993 context,
2994 params,
2995 n_fds,
2996 home,
2997 username,
2998 shell,
2999 journal_stream_dev,
3000 journal_stream_ino,
3001 &our_env);
3002 if (r < 0) {
3003 *exit_status = EXIT_MEMORY;
3004 return log_oom();
3005 }
3006
3007 r = build_pass_environment(context, &pass_env);
3008 if (r < 0) {
3009 *exit_status = EXIT_MEMORY;
3010 return log_oom();
3011 }
3012
3013 accum_env = strv_env_merge(5,
3014 params->environment,
3015 our_env,
3016 pass_env,
3017 context->environment,
3018 files_env,
3019 NULL);
3020 if (!accum_env) {
3021 *exit_status = EXIT_MEMORY;
3022 return log_oom();
3023 }
3024 accum_env = strv_env_clean(accum_env);
3025
3026 (void) umask(context->umask);
3027
3028 r = setup_keyring(unit, context, params, uid, gid);
3029 if (r < 0) {
3030 *exit_status = EXIT_KEYRING;
3031 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3032 }
3033
3034 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3035 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3036
3037 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3038 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3039
3040 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3041 if (needs_ambient_hack)
3042 needs_setuid = false;
3043 else
3044 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3045
3046 if (needs_sandboxing) {
3047 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3048 * present. The actual MAC context application will happen later, as late as possible, to avoid
3049 * impacting our own code paths. */
3050
3051 #if HAVE_SELINUX
3052 use_selinux = mac_selinux_use();
3053 #endif
3054 #if HAVE_SMACK
3055 use_smack = mac_smack_use();
3056 #endif
3057 #if HAVE_APPARMOR
3058 use_apparmor = mac_apparmor_use();
3059 #endif
3060 }
3061
3062 if (needs_setuid) {
3063 if (context->pam_name && username) {
3064 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3065 if (r < 0) {
3066 *exit_status = EXIT_PAM;
3067 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3068 }
3069 }
3070 }
3071
3072 if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3073 r = setup_netns(runtime->netns_storage_socket);
3074 if (r < 0) {
3075 *exit_status = EXIT_NETWORK;
3076 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3077 }
3078 }
3079
3080 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3081 if (needs_mount_namespace) {
3082 r = apply_mount_namespace(unit, command, context, params, runtime);
3083 if (r < 0) {
3084 *exit_status = EXIT_NAMESPACE;
3085 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3086 }
3087 }
3088
3089 /* Apply just after mount namespace setup */
3090 r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3091 if (r < 0)
3092 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3093
3094 /* Drop groups as early as possbile */
3095 if (needs_setuid) {
3096 r = enforce_groups(context, gid, supplementary_gids, ngids);
3097 if (r < 0) {
3098 *exit_status = EXIT_GROUP;
3099 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3100 }
3101 }
3102
3103 if (needs_sandboxing) {
3104 #if HAVE_SELINUX
3105 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3106 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3107 if (r < 0) {
3108 *exit_status = EXIT_SELINUX_CONTEXT;
3109 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3110 }
3111 }
3112 #endif
3113
3114 if (context->private_users) {
3115 r = setup_private_users(uid, gid);
3116 if (r < 0) {
3117 *exit_status = EXIT_USER;
3118 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3119 }
3120 }
3121 }
3122
3123 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3124 * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3125 * was needed to upload the policy and can now be closed as well. */
3126 r = close_all_fds(fds, n_fds);
3127 if (r >= 0)
3128 r = shift_fds(fds, n_fds);
3129 if (r >= 0)
3130 r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
3131 if (r < 0) {
3132 *exit_status = EXIT_FDS;
3133 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3134 }
3135
3136 secure_bits = context->secure_bits;
3137
3138 if (needs_sandboxing) {
3139 uint64_t bset;
3140
3141 for (i = 0; i < _RLIMIT_MAX; i++) {
3142
3143 if (!context->rlimit[i])
3144 continue;
3145
3146 r = setrlimit_closest(i, context->rlimit[i]);
3147 if (r < 0) {
3148 *exit_status = EXIT_LIMITS;
3149 return log_unit_error_errno(unit, r, "Failed to adjust resource limit %s: %m", rlimit_to_string(i));
3150 }
3151 }
3152
3153 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3154 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3155 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3156 *exit_status = EXIT_LIMITS;
3157 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3158 }
3159 }
3160
3161 bset = context->capability_bounding_set;
3162 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3163 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3164 * instead of us doing that */
3165 if (needs_ambient_hack)
3166 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3167 (UINT64_C(1) << CAP_SETUID) |
3168 (UINT64_C(1) << CAP_SETGID);
3169
3170 if (!cap_test_all(bset)) {
3171 r = capability_bounding_set_drop(bset, false);
3172 if (r < 0) {
3173 *exit_status = EXIT_CAPABILITIES;
3174 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3175 }
3176 }
3177
3178 /* This is done before enforce_user, but ambient set
3179 * does not survive over setresuid() if keep_caps is not set. */
3180 if (!needs_ambient_hack &&
3181 context->capability_ambient_set != 0) {
3182 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3183 if (r < 0) {
3184 *exit_status = EXIT_CAPABILITIES;
3185 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3186 }
3187 }
3188 }
3189
3190 if (needs_setuid) {
3191 if (context->user) {
3192 r = enforce_user(context, uid);
3193 if (r < 0) {
3194 *exit_status = EXIT_USER;
3195 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3196 }
3197
3198 if (!needs_ambient_hack &&
3199 context->capability_ambient_set != 0) {
3200
3201 /* Fix the ambient capabilities after user change. */
3202 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3203 if (r < 0) {
3204 *exit_status = EXIT_CAPABILITIES;
3205 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3206 }
3207
3208 /* If we were asked to change user and ambient capabilities
3209 * were requested, we had to add keep-caps to the securebits
3210 * so that we would maintain the inherited capability set
3211 * through the setresuid(). Make sure that the bit is added
3212 * also to the context secure_bits so that we don't try to
3213 * drop the bit away next. */
3214
3215 secure_bits |= 1<<SECURE_KEEP_CAPS;
3216 }
3217 }
3218 }
3219
3220 if (needs_sandboxing) {
3221 /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3222 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3223 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3224 * are restricted. */
3225
3226 #if HAVE_SELINUX
3227 if (use_selinux) {
3228 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3229
3230 if (exec_context) {
3231 r = setexeccon(exec_context);
3232 if (r < 0) {
3233 *exit_status = EXIT_SELINUX_CONTEXT;
3234 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3235 }
3236 }
3237 }
3238 #endif
3239
3240 #if HAVE_SMACK
3241 if (use_smack) {
3242 r = setup_smack(context, command);
3243 if (r < 0) {
3244 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3245 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3246 }
3247 }
3248 #endif
3249
3250 #if HAVE_APPARMOR
3251 if (use_apparmor && context->apparmor_profile) {
3252 r = aa_change_onexec(context->apparmor_profile);
3253 if (r < 0 && !context->apparmor_profile_ignore) {
3254 *exit_status = EXIT_APPARMOR_PROFILE;
3255 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3256 }
3257 }
3258 #endif
3259
3260 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3261 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3262 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3263 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3264 *exit_status = EXIT_SECUREBITS;
3265 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3266 }
3267
3268 if (context_has_no_new_privileges(context))
3269 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3270 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3271 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3272 }
3273
3274 #if HAVE_SECCOMP
3275 r = apply_address_families(unit, context);
3276 if (r < 0) {
3277 *exit_status = EXIT_ADDRESS_FAMILIES;
3278 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3279 }
3280
3281 r = apply_memory_deny_write_execute(unit, context);
3282 if (r < 0) {
3283 *exit_status = EXIT_SECCOMP;
3284 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3285 }
3286
3287 r = apply_restrict_realtime(unit, context);
3288 if (r < 0) {
3289 *exit_status = EXIT_SECCOMP;
3290 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3291 }
3292
3293 r = apply_restrict_namespaces(unit, context);
3294 if (r < 0) {
3295 *exit_status = EXIT_SECCOMP;
3296 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3297 }
3298
3299 r = apply_protect_sysctl(unit, context);
3300 if (r < 0) {
3301 *exit_status = EXIT_SECCOMP;
3302 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3303 }
3304
3305 r = apply_protect_kernel_modules(unit, context);
3306 if (r < 0) {
3307 *exit_status = EXIT_SECCOMP;
3308 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3309 }
3310
3311 r = apply_private_devices(unit, context);
3312 if (r < 0) {
3313 *exit_status = EXIT_SECCOMP;
3314 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3315 }
3316
3317 r = apply_syscall_archs(unit, context);
3318 if (r < 0) {
3319 *exit_status = EXIT_SECCOMP;
3320 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3321 }
3322
3323 r = apply_lock_personality(unit, context);
3324 if (r < 0) {
3325 *exit_status = EXIT_SECCOMP;
3326 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3327 }
3328
3329 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3330 * by the filter as little as possible. */
3331 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3332 if (r < 0) {
3333 *exit_status = EXIT_SECCOMP;
3334 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3335 }
3336 #endif
3337 }
3338
3339 if (!strv_isempty(context->unset_environment)) {
3340 char **ee = NULL;
3341
3342 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3343 if (!ee) {
3344 *exit_status = EXIT_MEMORY;
3345 return log_oom();
3346 }
3347
3348 strv_free(accum_env);
3349 accum_env = ee;
3350 }
3351
3352 final_argv = replace_env_argv(argv, accum_env);
3353 if (!final_argv) {
3354 *exit_status = EXIT_MEMORY;
3355 return log_oom();
3356 }
3357
3358 if (_unlikely_(log_get_max_level() >= LOG_DEBUG)) {
3359 _cleanup_free_ char *line;
3360
3361 line = exec_command_line(final_argv);
3362 if (line) {
3363 log_struct(LOG_DEBUG,
3364 "EXECUTABLE=%s", command->path,
3365 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3366 LOG_UNIT_ID(unit),
3367 LOG_UNIT_INVOCATION_ID(unit),
3368 NULL);
3369 }
3370 }
3371
3372 execve(command->path, final_argv, accum_env);
3373
3374 if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3375
3376 log_struct_errno(LOG_INFO, errno,
3377 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3378 LOG_UNIT_ID(unit),
3379 LOG_UNIT_INVOCATION_ID(unit),
3380 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3381 command->path),
3382 "EXECUTABLE=%s", command->path,
3383 NULL);
3384
3385 return 0;
3386 }
3387
3388 *exit_status = EXIT_EXEC;
3389 return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
3390 }
3391
3392 int exec_spawn(Unit *unit,
3393 ExecCommand *command,
3394 const ExecContext *context,
3395 const ExecParameters *params,
3396 ExecRuntime *runtime,
3397 DynamicCreds *dcreds,
3398 pid_t *ret) {
3399
3400 _cleanup_strv_free_ char **files_env = NULL;
3401 int *fds = NULL;
3402 unsigned n_storage_fds = 0, n_socket_fds = 0;
3403 _cleanup_free_ char *line = NULL;
3404 int socket_fd, r;
3405 int named_iofds[3] = { -1, -1, -1 };
3406 char **argv;
3407 pid_t pid;
3408
3409 assert(unit);
3410 assert(command);
3411 assert(context);
3412 assert(ret);
3413 assert(params);
3414 assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
3415
3416 if (context->std_input == EXEC_INPUT_SOCKET ||
3417 context->std_output == EXEC_OUTPUT_SOCKET ||
3418 context->std_error == EXEC_OUTPUT_SOCKET) {
3419
3420 if (params->n_socket_fds > 1) {
3421 log_unit_error(unit, "Got more than one socket.");
3422 return -EINVAL;
3423 }
3424
3425 if (params->n_socket_fds == 0) {
3426 log_unit_error(unit, "Got no socket.");
3427 return -EINVAL;
3428 }
3429
3430 socket_fd = params->fds[0];
3431 } else {
3432 socket_fd = -1;
3433 fds = params->fds;
3434 n_storage_fds = params->n_storage_fds;
3435 n_socket_fds = params->n_socket_fds;
3436 }
3437
3438 r = exec_context_named_iofds(unit, context, params, named_iofds);
3439 if (r < 0)
3440 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3441
3442 r = exec_context_load_environment(unit, context, &files_env);
3443 if (r < 0)
3444 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3445
3446 argv = params->argv ?: command->argv;
3447 line = exec_command_line(argv);
3448 if (!line)
3449 return log_oom();
3450
3451 log_struct(LOG_DEBUG,
3452 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3453 "EXECUTABLE=%s", command->path,
3454 LOG_UNIT_ID(unit),
3455 LOG_UNIT_INVOCATION_ID(unit),
3456 NULL);
3457
3458 pid = fork();
3459 if (pid < 0)
3460 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3461
3462 if (pid == 0) {
3463 int exit_status = EXIT_SUCCESS;
3464
3465 r = exec_child(unit,
3466 command,
3467 context,
3468 params,
3469 runtime,
3470 dcreds,
3471 argv,
3472 socket_fd,
3473 named_iofds,
3474 fds,
3475 n_storage_fds,
3476 n_socket_fds,
3477 files_env,
3478 unit->manager->user_lookup_fds[1],
3479 &exit_status);
3480
3481 if (r < 0) {
3482 log_struct_errno(LOG_ERR, r,
3483 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3484 LOG_UNIT_ID(unit),
3485 LOG_UNIT_INVOCATION_ID(unit),
3486 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3487 exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3488 command->path),
3489 "EXECUTABLE=%s", command->path,
3490 NULL);
3491 }
3492
3493 _exit(exit_status);
3494 }
3495
3496 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3497
3498 /* We add the new process to the cgroup both in the child (so
3499 * that we can be sure that no user code is ever executed
3500 * outside of the cgroup) and in the parent (so that we can be
3501 * sure that when we kill the cgroup the process will be
3502 * killed too). */
3503 if (params->cgroup_path)
3504 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3505
3506 exec_status_start(&command->exec_status, pid);
3507
3508 *ret = pid;
3509 return 0;
3510 }
3511
3512 void exec_context_init(ExecContext *c) {
3513 ExecDirectoryType i;
3514
3515 assert(c);
3516
3517 c->umask = 0022;
3518 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3519 c->cpu_sched_policy = SCHED_OTHER;
3520 c->syslog_priority = LOG_DAEMON|LOG_INFO;
3521 c->syslog_level_prefix = true;
3522 c->ignore_sigpipe = true;
3523 c->timer_slack_nsec = NSEC_INFINITY;
3524 c->personality = PERSONALITY_INVALID;
3525 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3526 c->directories[i].mode = 0755;
3527 c->capability_bounding_set = CAP_ALL;
3528 c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
3529 }
3530
3531 void exec_context_done(ExecContext *c) {
3532 unsigned l;
3533 ExecDirectoryType i;
3534
3535 assert(c);
3536
3537 c->environment = strv_free(c->environment);
3538 c->environment_files = strv_free(c->environment_files);
3539 c->pass_environment = strv_free(c->pass_environment);
3540 c->unset_environment = strv_free(c->unset_environment);
3541
3542 for (l = 0; l < ELEMENTSOF(c->rlimit); l++)
3543 c->rlimit[l] = mfree(c->rlimit[l]);
3544
3545 for (l = 0; l < 3; l++)
3546 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3547
3548 c->working_directory = mfree(c->working_directory);
3549 c->root_directory = mfree(c->root_directory);
3550 c->root_image = mfree(c->root_image);
3551 c->tty_path = mfree(c->tty_path);
3552 c->syslog_identifier = mfree(c->syslog_identifier);
3553 c->user = mfree(c->user);
3554 c->group = mfree(c->group);
3555
3556 c->supplementary_groups = strv_free(c->supplementary_groups);
3557
3558 c->pam_name = mfree(c->pam_name);
3559
3560 c->read_only_paths = strv_free(c->read_only_paths);
3561 c->read_write_paths = strv_free(c->read_write_paths);
3562 c->inaccessible_paths = strv_free(c->inaccessible_paths);
3563
3564 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3565
3566 if (c->cpuset)
3567 CPU_FREE(c->cpuset);
3568
3569 c->utmp_id = mfree(c->utmp_id);
3570 c->selinux_context = mfree(c->selinux_context);
3571 c->apparmor_profile = mfree(c->apparmor_profile);
3572 c->smack_process_label = mfree(c->smack_process_label);
3573
3574 c->syscall_filter = set_free(c->syscall_filter);
3575 c->syscall_archs = set_free(c->syscall_archs);
3576 c->address_families = set_free(c->address_families);
3577
3578 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3579 c->directories[i].paths = strv_free(c->directories[i].paths);
3580 }
3581
3582 int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_prefix) {
3583 char **i;
3584
3585 assert(c);
3586
3587 if (!runtime_prefix)
3588 return 0;
3589
3590 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3591 _cleanup_free_ char *p;
3592
3593 p = strjoin(runtime_prefix, "/", *i);
3594 if (!p)
3595 return -ENOMEM;
3596
3597 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3598 * next. */
3599 (void) rm_rf(p, REMOVE_ROOT);
3600
3601 /* Also destroy any matching subdirectory below /private/. This is done to support DynamicUser=1
3602 * setups. Note that we don't conditionalize here on that though, as the namespace is same way, and it
3603 * makes us a bit more robust towards changing unit settings. Or to say this differently: in the worst
3604 * case this is a NOP. */
3605
3606 free(p);
3607 p = strjoin(runtime_prefix, "/private/", *i);
3608 if (!p)
3609 return -ENOMEM;
3610
3611 (void) rm_rf(p, REMOVE_ROOT);
3612 }
3613
3614 return 0;
3615 }
3616
3617 void exec_command_done(ExecCommand *c) {
3618 assert(c);
3619
3620 c->path = mfree(c->path);
3621
3622 c->argv = strv_free(c->argv);
3623 }
3624
3625 void exec_command_done_array(ExecCommand *c, unsigned n) {
3626 unsigned i;
3627
3628 for (i = 0; i < n; i++)
3629 exec_command_done(c+i);
3630 }
3631
3632 ExecCommand* exec_command_free_list(ExecCommand *c) {
3633 ExecCommand *i;
3634
3635 while ((i = c)) {
3636 LIST_REMOVE(command, c, i);
3637 exec_command_done(i);
3638 free(i);
3639 }
3640
3641 return NULL;
3642 }
3643
3644 void exec_command_free_array(ExecCommand **c, unsigned n) {
3645 unsigned i;
3646
3647 for (i = 0; i < n; i++)
3648 c[i] = exec_command_free_list(c[i]);
3649 }
3650
3651 typedef struct InvalidEnvInfo {
3652 Unit *unit;
3653 const char *path;
3654 } InvalidEnvInfo;
3655
3656 static void invalid_env(const char *p, void *userdata) {
3657 InvalidEnvInfo *info = userdata;
3658
3659 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3660 }
3661
3662 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3663 assert(c);
3664
3665 switch (fd_index) {
3666 case STDIN_FILENO:
3667 if (c->std_input != EXEC_INPUT_NAMED_FD)
3668 return NULL;
3669 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3670 case STDOUT_FILENO:
3671 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3672 return NULL;
3673 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3674 case STDERR_FILENO:
3675 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3676 return NULL;
3677 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3678 default:
3679 return NULL;
3680 }
3681 }
3682
3683 int exec_context_named_iofds(Unit *unit, const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3684 unsigned i, targets;
3685 const char* stdio_fdname[3];
3686 unsigned n_fds;
3687
3688 assert(c);
3689 assert(p);
3690
3691 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3692 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3693 (c->std_error == EXEC_OUTPUT_NAMED_FD);
3694
3695 for (i = 0; i < 3; i++)
3696 stdio_fdname[i] = exec_context_fdname(c, i);
3697
3698 n_fds = p->n_storage_fds + p->n_socket_fds;
3699
3700 for (i = 0; i < n_fds && targets > 0; i++)
3701 if (named_iofds[STDIN_FILENO] < 0 &&
3702 c->std_input == EXEC_INPUT_NAMED_FD &&
3703 stdio_fdname[STDIN_FILENO] &&
3704 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3705
3706 named_iofds[STDIN_FILENO] = p->fds[i];
3707 targets--;
3708
3709 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3710 c->std_output == EXEC_OUTPUT_NAMED_FD &&
3711 stdio_fdname[STDOUT_FILENO] &&
3712 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3713
3714 named_iofds[STDOUT_FILENO] = p->fds[i];
3715 targets--;
3716
3717 } else if (named_iofds[STDERR_FILENO] < 0 &&
3718 c->std_error == EXEC_OUTPUT_NAMED_FD &&
3719 stdio_fdname[STDERR_FILENO] &&
3720 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3721
3722 named_iofds[STDERR_FILENO] = p->fds[i];
3723 targets--;
3724 }
3725
3726 return targets == 0 ? 0 : -ENOENT;
3727 }
3728
3729 int exec_context_load_environment(Unit *unit, const ExecContext *c, char ***l) {
3730 char **i, **r = NULL;
3731
3732 assert(c);
3733 assert(l);
3734
3735 STRV_FOREACH(i, c->environment_files) {
3736 char *fn;
3737 int k;
3738 unsigned n;
3739 bool ignore = false;
3740 char **p;
3741 _cleanup_globfree_ glob_t pglob = {};
3742
3743 fn = *i;
3744
3745 if (fn[0] == '-') {
3746 ignore = true;
3747 fn++;
3748 }
3749
3750 if (!path_is_absolute(fn)) {
3751 if (ignore)
3752 continue;
3753
3754 strv_free(r);
3755 return -EINVAL;
3756 }
3757
3758 /* Filename supports globbing, take all matching files */
3759 k = safe_glob(fn, 0, &pglob);
3760 if (k < 0) {
3761 if (ignore)
3762 continue;
3763
3764 strv_free(r);
3765 return k;
3766 }
3767
3768 /* When we don't match anything, -ENOENT should be returned */
3769 assert(pglob.gl_pathc > 0);
3770
3771 for (n = 0; n < pglob.gl_pathc; n++) {
3772 k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3773 if (k < 0) {
3774 if (ignore)
3775 continue;
3776
3777 strv_free(r);
3778 return k;
3779 }
3780 /* Log invalid environment variables with filename */
3781 if (p) {
3782 InvalidEnvInfo info = {
3783 .unit = unit,
3784 .path = pglob.gl_pathv[n]
3785 };
3786
3787 p = strv_env_clean_with_callback(p, invalid_env, &info);
3788 }
3789
3790 if (r == NULL)
3791 r = p;
3792 else {
3793 char **m;
3794
3795 m = strv_env_merge(2, r, p);
3796 strv_free(r);
3797 strv_free(p);
3798 if (!m)
3799 return -ENOMEM;
3800
3801 r = m;
3802 }
3803 }
3804 }
3805
3806 *l = r;
3807
3808 return 0;
3809 }
3810
3811 static bool tty_may_match_dev_console(const char *tty) {
3812 _cleanup_free_ char *active = NULL;
3813 char *console;
3814
3815 if (!tty)
3816 return true;
3817
3818 tty = skip_dev_prefix(tty);
3819
3820 /* trivial identity? */
3821 if (streq(tty, "console"))
3822 return true;
3823
3824 console = resolve_dev_console(&active);
3825 /* if we could not resolve, assume it may */
3826 if (!console)
3827 return true;
3828
3829 /* "tty0" means the active VC, so it may be the same sometimes */
3830 return streq(console, tty) || (streq(console, "tty0") && tty_is_vc(tty));
3831 }
3832
3833 bool exec_context_may_touch_console(ExecContext *ec) {
3834
3835 return (ec->tty_reset ||
3836 ec->tty_vhangup ||
3837 ec->tty_vt_disallocate ||
3838 is_terminal_input(ec->std_input) ||
3839 is_terminal_output(ec->std_output) ||
3840 is_terminal_output(ec->std_error)) &&
3841 tty_may_match_dev_console(exec_context_tty_path(ec));
3842 }
3843
3844 static void strv_fprintf(FILE *f, char **l) {
3845 char **g;
3846
3847 assert(f);
3848
3849 STRV_FOREACH(g, l)
3850 fprintf(f, " %s", *g);
3851 }
3852
3853 void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
3854 char **e, **d;
3855 unsigned i;
3856 ExecDirectoryType dt;
3857 int r;
3858
3859 assert(c);
3860 assert(f);
3861
3862 prefix = strempty(prefix);
3863
3864 fprintf(f,
3865 "%sUMask: %04o\n"
3866 "%sWorkingDirectory: %s\n"
3867 "%sRootDirectory: %s\n"
3868 "%sNonBlocking: %s\n"
3869 "%sPrivateTmp: %s\n"
3870 "%sPrivateDevices: %s\n"
3871 "%sProtectKernelTunables: %s\n"
3872 "%sProtectKernelModules: %s\n"
3873 "%sProtectControlGroups: %s\n"
3874 "%sPrivateNetwork: %s\n"
3875 "%sPrivateUsers: %s\n"
3876 "%sProtectHome: %s\n"
3877 "%sProtectSystem: %s\n"
3878 "%sMountAPIVFS: %s\n"
3879 "%sIgnoreSIGPIPE: %s\n"
3880 "%sMemoryDenyWriteExecute: %s\n"
3881 "%sRestrictRealtime: %s\n"
3882 "%sKeyringMode: %s\n",
3883 prefix, c->umask,
3884 prefix, c->working_directory ? c->working_directory : "/",
3885 prefix, c->root_directory ? c->root_directory : "/",
3886 prefix, yes_no(c->non_blocking),
3887 prefix, yes_no(c->private_tmp),
3888 prefix, yes_no(c->private_devices),
3889 prefix, yes_no(c->protect_kernel_tunables),
3890 prefix, yes_no(c->protect_kernel_modules),
3891 prefix, yes_no(c->protect_control_groups),
3892 prefix, yes_no(c->private_network),
3893 prefix, yes_no(c->private_users),
3894 prefix, protect_home_to_string(c->protect_home),
3895 prefix, protect_system_to_string(c->protect_system),
3896 prefix, yes_no(c->mount_apivfs),
3897 prefix, yes_no(c->ignore_sigpipe),
3898 prefix, yes_no(c->memory_deny_write_execute),
3899 prefix, yes_no(c->restrict_realtime),
3900 prefix, exec_keyring_mode_to_string(c->keyring_mode));
3901
3902 if (c->root_image)
3903 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
3904
3905 STRV_FOREACH(e, c->environment)
3906 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
3907
3908 STRV_FOREACH(e, c->environment_files)
3909 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
3910
3911 STRV_FOREACH(e, c->pass_environment)
3912 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
3913
3914 STRV_FOREACH(e, c->unset_environment)
3915 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
3916
3917 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
3918
3919 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3920 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
3921
3922 STRV_FOREACH(d, c->directories[dt].paths)
3923 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
3924 }
3925
3926 if (c->nice_set)
3927 fprintf(f,
3928 "%sNice: %i\n",
3929 prefix, c->nice);
3930
3931 if (c->oom_score_adjust_set)
3932 fprintf(f,
3933 "%sOOMScoreAdjust: %i\n",
3934 prefix, c->oom_score_adjust);
3935
3936 for (i = 0; i < RLIM_NLIMITS; i++)
3937 if (c->rlimit[i]) {
3938 fprintf(f, "%s%s: " RLIM_FMT "\n",
3939 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
3940 fprintf(f, "%s%sSoft: " RLIM_FMT "\n",
3941 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
3942 }
3943
3944 if (c->ioprio_set) {
3945 _cleanup_free_ char *class_str = NULL;
3946
3947 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
3948 if (r >= 0)
3949 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
3950
3951 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
3952 }
3953
3954 if (c->cpu_sched_set) {
3955 _cleanup_free_ char *policy_str = NULL;
3956
3957 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
3958 if (r >= 0)
3959 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
3960
3961 fprintf(f,
3962 "%sCPUSchedulingPriority: %i\n"
3963 "%sCPUSchedulingResetOnFork: %s\n",
3964 prefix, c->cpu_sched_priority,
3965 prefix, yes_no(c->cpu_sched_reset_on_fork));
3966 }
3967
3968 if (c->cpuset) {
3969 fprintf(f, "%sCPUAffinity:", prefix);
3970 for (i = 0; i < c->cpuset_ncpus; i++)
3971 if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
3972 fprintf(f, " %u", i);
3973 fputs("\n", f);
3974 }
3975
3976 if (c->timer_slack_nsec != NSEC_INFINITY)
3977 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
3978
3979 fprintf(f,
3980 "%sStandardInput: %s\n"
3981 "%sStandardOutput: %s\n"
3982 "%sStandardError: %s\n",
3983 prefix, exec_input_to_string(c->std_input),
3984 prefix, exec_output_to_string(c->std_output),
3985 prefix, exec_output_to_string(c->std_error));
3986
3987 if (c->tty_path)
3988 fprintf(f,
3989 "%sTTYPath: %s\n"
3990 "%sTTYReset: %s\n"
3991 "%sTTYVHangup: %s\n"
3992 "%sTTYVTDisallocate: %s\n",
3993 prefix, c->tty_path,
3994 prefix, yes_no(c->tty_reset),
3995 prefix, yes_no(c->tty_vhangup),
3996 prefix, yes_no(c->tty_vt_disallocate));
3997
3998 if (IN_SET(c->std_output,
3999 EXEC_OUTPUT_SYSLOG,
4000 EXEC_OUTPUT_KMSG,
4001 EXEC_OUTPUT_JOURNAL,
4002 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4003 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4004 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4005 IN_SET(c->std_error,
4006 EXEC_OUTPUT_SYSLOG,
4007 EXEC_OUTPUT_KMSG,
4008 EXEC_OUTPUT_JOURNAL,
4009 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4010 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4011 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4012
4013 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4014
4015 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4016 if (r >= 0)
4017 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4018
4019 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4020 if (r >= 0)
4021 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4022 }
4023
4024 if (c->secure_bits) {
4025 _cleanup_free_ char *str = NULL;
4026
4027 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4028 if (r >= 0)
4029 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4030 }
4031
4032 if (c->capability_bounding_set != CAP_ALL) {
4033 _cleanup_free_ char *str = NULL;
4034
4035 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4036 if (r >= 0)
4037 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4038 }
4039
4040 if (c->capability_ambient_set != 0) {
4041 _cleanup_free_ char *str = NULL;
4042
4043 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4044 if (r >= 0)
4045 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4046 }
4047
4048 if (c->user)
4049 fprintf(f, "%sUser: %s\n", prefix, c->user);
4050 if (c->group)
4051 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4052
4053 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4054
4055 if (strv_length(c->supplementary_groups) > 0) {
4056 fprintf(f, "%sSupplementaryGroups:", prefix);
4057 strv_fprintf(f, c->supplementary_groups);
4058 fputs("\n", f);
4059 }
4060
4061 if (c->pam_name)
4062 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4063
4064 if (strv_length(c->read_write_paths) > 0) {
4065 fprintf(f, "%sReadWritePaths:", prefix);
4066 strv_fprintf(f, c->read_write_paths);
4067 fputs("\n", f);
4068 }
4069
4070 if (strv_length(c->read_only_paths) > 0) {
4071 fprintf(f, "%sReadOnlyPaths:", prefix);
4072 strv_fprintf(f, c->read_only_paths);
4073 fputs("\n", f);
4074 }
4075
4076 if (strv_length(c->inaccessible_paths) > 0) {
4077 fprintf(f, "%sInaccessiblePaths:", prefix);
4078 strv_fprintf(f, c->inaccessible_paths);
4079 fputs("\n", f);
4080 }
4081
4082 if (c->n_bind_mounts > 0)
4083 for (i = 0; i < c->n_bind_mounts; i++) {
4084 fprintf(f, "%s%s: %s:%s:%s\n", prefix,
4085 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4086 c->bind_mounts[i].source,
4087 c->bind_mounts[i].destination,
4088 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4089 }
4090
4091 if (c->utmp_id)
4092 fprintf(f,
4093 "%sUtmpIdentifier: %s\n",
4094 prefix, c->utmp_id);
4095
4096 if (c->selinux_context)
4097 fprintf(f,
4098 "%sSELinuxContext: %s%s\n",
4099 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4100
4101 if (c->apparmor_profile)
4102 fprintf(f,
4103 "%sAppArmorProfile: %s%s\n",
4104 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4105
4106 if (c->smack_process_label)
4107 fprintf(f,
4108 "%sSmackProcessLabel: %s%s\n",
4109 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4110
4111 if (c->personality != PERSONALITY_INVALID)
4112 fprintf(f,
4113 "%sPersonality: %s\n",
4114 prefix, strna(personality_to_string(c->personality)));
4115
4116 fprintf(f,
4117 "%sLockPersonality: %s\n",
4118 prefix, yes_no(c->lock_personality));
4119
4120 if (c->syscall_filter) {
4121 #if HAVE_SECCOMP
4122 Iterator j;
4123 void *id;
4124 bool first = true;
4125 #endif
4126
4127 fprintf(f,
4128 "%sSystemCallFilter: ",
4129 prefix);
4130
4131 if (!c->syscall_whitelist)
4132 fputc('~', f);
4133
4134 #if HAVE_SECCOMP
4135 SET_FOREACH(id, c->syscall_filter, j) {
4136 _cleanup_free_ char *name = NULL;
4137
4138 if (first)
4139 first = false;
4140 else
4141 fputc(' ', f);
4142
4143 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4144 fputs(strna(name), f);
4145 }
4146 #endif
4147
4148 fputc('\n', f);
4149 }
4150
4151 if (c->syscall_archs) {
4152 #if HAVE_SECCOMP
4153 Iterator j;
4154 void *id;
4155 #endif
4156
4157 fprintf(f,
4158 "%sSystemCallArchitectures:",
4159 prefix);
4160
4161 #if HAVE_SECCOMP
4162 SET_FOREACH(id, c->syscall_archs, j)
4163 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4164 #endif
4165 fputc('\n', f);
4166 }
4167
4168 if (exec_context_restrict_namespaces_set(c)) {
4169 _cleanup_free_ char *s = NULL;
4170
4171 r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
4172 if (r >= 0)
4173 fprintf(f, "%sRestrictNamespaces: %s\n",
4174 prefix, s);
4175 }
4176
4177 if (c->syscall_errno > 0)
4178 fprintf(f,
4179 "%sSystemCallErrorNumber: %s\n",
4180 prefix, strna(errno_to_name(c->syscall_errno)));
4181
4182 if (c->apparmor_profile)
4183 fprintf(f,
4184 "%sAppArmorProfile: %s%s\n",
4185 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4186 }
4187
4188 bool exec_context_maintains_privileges(ExecContext *c) {
4189 assert(c);
4190
4191 /* Returns true if the process forked off would run under
4192 * an unchanged UID or as root. */
4193
4194 if (!c->user)
4195 return true;
4196
4197 if (streq(c->user, "root") || streq(c->user, "0"))
4198 return true;
4199
4200 return false;
4201 }
4202
4203 int exec_context_get_effective_ioprio(ExecContext *c) {
4204 int p;
4205
4206 assert(c);
4207
4208 if (c->ioprio_set)
4209 return c->ioprio;
4210
4211 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4212 if (p < 0)
4213 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4214
4215 return p;
4216 }
4217
4218 void exec_status_start(ExecStatus *s, pid_t pid) {
4219 assert(s);
4220
4221 zero(*s);
4222 s->pid = pid;
4223 dual_timestamp_get(&s->start_timestamp);
4224 }
4225
4226 void exec_status_exit(ExecStatus *s, ExecContext *context, pid_t pid, int code, int status) {
4227 assert(s);
4228
4229 if (s->pid && s->pid != pid)
4230 zero(*s);
4231
4232 s->pid = pid;
4233 dual_timestamp_get(&s->exit_timestamp);
4234
4235 s->code = code;
4236 s->status = status;
4237
4238 if (context) {
4239 if (context->utmp_id)
4240 utmp_put_dead_process(context->utmp_id, pid, code, status);
4241
4242 exec_context_tty_reset(context, NULL);
4243 }
4244 }
4245
4246 void exec_status_dump(ExecStatus *s, FILE *f, const char *prefix) {
4247 char buf[FORMAT_TIMESTAMP_MAX];
4248
4249 assert(s);
4250 assert(f);
4251
4252 if (s->pid <= 0)
4253 return;
4254
4255 prefix = strempty(prefix);
4256
4257 fprintf(f,
4258 "%sPID: "PID_FMT"\n",
4259 prefix, s->pid);
4260
4261 if (dual_timestamp_is_set(&s->start_timestamp))
4262 fprintf(f,
4263 "%sStart Timestamp: %s\n",
4264 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4265
4266 if (dual_timestamp_is_set(&s->exit_timestamp))
4267 fprintf(f,
4268 "%sExit Timestamp: %s\n"
4269 "%sExit Code: %s\n"
4270 "%sExit Status: %i\n",
4271 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4272 prefix, sigchld_code_to_string(s->code),
4273 prefix, s->status);
4274 }
4275
4276 char *exec_command_line(char **argv) {
4277 size_t k;
4278 char *n, *p, **a;
4279 bool first = true;
4280
4281 assert(argv);
4282
4283 k = 1;
4284 STRV_FOREACH(a, argv)
4285 k += strlen(*a)+3;
4286
4287 n = new(char, k);
4288 if (!n)
4289 return NULL;
4290
4291 p = n;
4292 STRV_FOREACH(a, argv) {
4293
4294 if (!first)
4295 *(p++) = ' ';
4296 else
4297 first = false;
4298
4299 if (strpbrk(*a, WHITESPACE)) {
4300 *(p++) = '\'';
4301 p = stpcpy(p, *a);
4302 *(p++) = '\'';
4303 } else
4304 p = stpcpy(p, *a);
4305
4306 }
4307
4308 *p = 0;
4309
4310 /* FIXME: this doesn't really handle arguments that have
4311 * spaces and ticks in them */
4312
4313 return n;
4314 }
4315
4316 void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4317 _cleanup_free_ char *cmd = NULL;
4318 const char *prefix2;
4319
4320 assert(c);
4321 assert(f);
4322
4323 prefix = strempty(prefix);
4324 prefix2 = strjoina(prefix, "\t");
4325
4326 cmd = exec_command_line(c->argv);
4327 fprintf(f,
4328 "%sCommand Line: %s\n",
4329 prefix, cmd ? cmd : strerror(ENOMEM));
4330
4331 exec_status_dump(&c->exec_status, f, prefix2);
4332 }
4333
4334 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4335 assert(f);
4336
4337 prefix = strempty(prefix);
4338
4339 LIST_FOREACH(command, c, c)
4340 exec_command_dump(c, f, prefix);
4341 }
4342
4343 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4344 ExecCommand *end;
4345
4346 assert(l);
4347 assert(e);
4348
4349 if (*l) {
4350 /* It's kind of important, that we keep the order here */
4351 LIST_FIND_TAIL(command, *l, end);
4352 LIST_INSERT_AFTER(command, *l, end, e);
4353 } else
4354 *l = e;
4355 }
4356
4357 int exec_command_set(ExecCommand *c, const char *path, ...) {
4358 va_list ap;
4359 char **l, *p;
4360
4361 assert(c);
4362 assert(path);
4363
4364 va_start(ap, path);
4365 l = strv_new_ap(path, ap);
4366 va_end(ap);
4367
4368 if (!l)
4369 return -ENOMEM;
4370
4371 p = strdup(path);
4372 if (!p) {
4373 strv_free(l);
4374 return -ENOMEM;
4375 }
4376
4377 free(c->path);
4378 c->path = p;
4379
4380 strv_free(c->argv);
4381 c->argv = l;
4382
4383 return 0;
4384 }
4385
4386 int exec_command_append(ExecCommand *c, const char *path, ...) {
4387 _cleanup_strv_free_ char **l = NULL;
4388 va_list ap;
4389 int r;
4390
4391 assert(c);
4392 assert(path);
4393
4394 va_start(ap, path);
4395 l = strv_new_ap(path, ap);
4396 va_end(ap);
4397
4398 if (!l)
4399 return -ENOMEM;
4400
4401 r = strv_extend_strv(&c->argv, l, false);
4402 if (r < 0)
4403 return r;
4404
4405 return 0;
4406 }
4407
4408
4409 static int exec_runtime_allocate(ExecRuntime **rt) {
4410
4411 if (*rt)
4412 return 0;
4413
4414 *rt = new0(ExecRuntime, 1);
4415 if (!*rt)
4416 return -ENOMEM;
4417
4418 (*rt)->n_ref = 1;
4419 (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4420
4421 return 0;
4422 }
4423
4424 int exec_runtime_make(ExecRuntime **rt, ExecContext *c, const char *id) {
4425 int r;
4426
4427 assert(rt);
4428 assert(c);
4429 assert(id);
4430
4431 if (*rt)
4432 return 1;
4433
4434 if (!c->private_network && !c->private_tmp)
4435 return 0;
4436
4437 r = exec_runtime_allocate(rt);
4438 if (r < 0)
4439 return r;
4440
4441 if (c->private_network && (*rt)->netns_storage_socket[0] < 0) {
4442 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, (*rt)->netns_storage_socket) < 0)
4443 return -errno;
4444 }
4445
4446 if (c->private_tmp && !(*rt)->tmp_dir) {
4447 r = setup_tmp_dirs(id, &(*rt)->tmp_dir, &(*rt)->var_tmp_dir);
4448 if (r < 0)
4449 return r;
4450 }
4451
4452 return 1;
4453 }
4454
4455 ExecRuntime *exec_runtime_ref(ExecRuntime *r) {
4456 assert(r);
4457 assert(r->n_ref > 0);
4458
4459 r->n_ref++;
4460 return r;
4461 }
4462
4463 ExecRuntime *exec_runtime_unref(ExecRuntime *r) {
4464
4465 if (!r)
4466 return NULL;
4467
4468 assert(r->n_ref > 0);
4469
4470 r->n_ref--;
4471 if (r->n_ref > 0)
4472 return NULL;
4473
4474 free(r->tmp_dir);
4475 free(r->var_tmp_dir);
4476 safe_close_pair(r->netns_storage_socket);
4477 return mfree(r);
4478 }
4479
4480 int exec_runtime_serialize(Unit *u, ExecRuntime *rt, FILE *f, FDSet *fds) {
4481 assert(u);
4482 assert(f);
4483 assert(fds);
4484
4485 if (!rt)
4486 return 0;
4487
4488 if (rt->tmp_dir)
4489 unit_serialize_item(u, f, "tmp-dir", rt->tmp_dir);
4490
4491 if (rt->var_tmp_dir)
4492 unit_serialize_item(u, f, "var-tmp-dir", rt->var_tmp_dir);
4493
4494 if (rt->netns_storage_socket[0] >= 0) {
4495 int copy;
4496
4497 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4498 if (copy < 0)
4499 return copy;
4500
4501 unit_serialize_item_format(u, f, "netns-socket-0", "%i", copy);
4502 }
4503
4504 if (rt->netns_storage_socket[1] >= 0) {
4505 int copy;
4506
4507 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4508 if (copy < 0)
4509 return copy;
4510
4511 unit_serialize_item_format(u, f, "netns-socket-1", "%i", copy);
4512 }
4513
4514 return 0;
4515 }
4516
4517 int exec_runtime_deserialize_item(Unit *u, ExecRuntime **rt, const char *key, const char *value, FDSet *fds) {
4518 int r;
4519
4520 assert(rt);
4521 assert(key);
4522 assert(value);
4523
4524 if (streq(key, "tmp-dir")) {
4525 char *copy;
4526
4527 r = exec_runtime_allocate(rt);
4528 if (r < 0)
4529 return log_oom();
4530
4531 copy = strdup(value);
4532 if (!copy)
4533 return log_oom();
4534
4535 free((*rt)->tmp_dir);
4536 (*rt)->tmp_dir = copy;
4537
4538 } else if (streq(key, "var-tmp-dir")) {
4539 char *copy;
4540
4541 r = exec_runtime_allocate(rt);
4542 if (r < 0)
4543 return log_oom();
4544
4545 copy = strdup(value);
4546 if (!copy)
4547 return log_oom();
4548
4549 free((*rt)->var_tmp_dir);
4550 (*rt)->var_tmp_dir = copy;
4551
4552 } else if (streq(key, "netns-socket-0")) {
4553 int fd;
4554
4555 r = exec_runtime_allocate(rt);
4556 if (r < 0)
4557 return log_oom();
4558
4559 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4560 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4561 else {
4562 safe_close((*rt)->netns_storage_socket[0]);
4563 (*rt)->netns_storage_socket[0] = fdset_remove(fds, fd);
4564 }
4565 } else if (streq(key, "netns-socket-1")) {
4566 int fd;
4567
4568 r = exec_runtime_allocate(rt);
4569 if (r < 0)
4570 return log_oom();
4571
4572 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4573 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4574 else {
4575 safe_close((*rt)->netns_storage_socket[1]);
4576 (*rt)->netns_storage_socket[1] = fdset_remove(fds, fd);
4577 }
4578 } else
4579 return 0;
4580
4581 return 1;
4582 }
4583
4584 static void *remove_tmpdir_thread(void *p) {
4585 _cleanup_free_ char *path = p;
4586
4587 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4588 return NULL;
4589 }
4590
4591 void exec_runtime_destroy(ExecRuntime *rt) {
4592 int r;
4593
4594 if (!rt)
4595 return;
4596
4597 /* If there are multiple users of this, let's leave the stuff around */
4598 if (rt->n_ref > 1)
4599 return;
4600
4601 if (rt->tmp_dir) {
4602 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4603
4604 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4605 if (r < 0) {
4606 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4607 free(rt->tmp_dir);
4608 }
4609
4610 rt->tmp_dir = NULL;
4611 }
4612
4613 if (rt->var_tmp_dir) {
4614 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4615
4616 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4617 if (r < 0) {
4618 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4619 free(rt->var_tmp_dir);
4620 }
4621
4622 rt->var_tmp_dir = NULL;
4623 }
4624
4625 safe_close_pair(rt->netns_storage_socket);
4626 }
4627
4628 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
4629 [EXEC_INPUT_NULL] = "null",
4630 [EXEC_INPUT_TTY] = "tty",
4631 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4632 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
4633 [EXEC_INPUT_SOCKET] = "socket",
4634 [EXEC_INPUT_NAMED_FD] = "fd",
4635 };
4636
4637 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
4638
4639 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
4640 [EXEC_OUTPUT_INHERIT] = "inherit",
4641 [EXEC_OUTPUT_NULL] = "null",
4642 [EXEC_OUTPUT_TTY] = "tty",
4643 [EXEC_OUTPUT_SYSLOG] = "syslog",
4644 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
4645 [EXEC_OUTPUT_KMSG] = "kmsg",
4646 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
4647 [EXEC_OUTPUT_JOURNAL] = "journal",
4648 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
4649 [EXEC_OUTPUT_SOCKET] = "socket",
4650 [EXEC_OUTPUT_NAMED_FD] = "fd",
4651 };
4652
4653 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
4654
4655 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
4656 [EXEC_UTMP_INIT] = "init",
4657 [EXEC_UTMP_LOGIN] = "login",
4658 [EXEC_UTMP_USER] = "user",
4659 };
4660
4661 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
4662
4663 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
4664 [EXEC_PRESERVE_NO] = "no",
4665 [EXEC_PRESERVE_YES] = "yes",
4666 [EXEC_PRESERVE_RESTART] = "restart",
4667 };
4668
4669 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
4670
4671 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
4672 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
4673 [EXEC_DIRECTORY_STATE] = "StateDirectory",
4674 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
4675 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
4676 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
4677 };
4678
4679 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
4680
4681 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
4682 [EXEC_KEYRING_INHERIT] = "inherit",
4683 [EXEC_KEYRING_PRIVATE] = "private",
4684 [EXEC_KEYRING_SHARED] = "shared",
4685 };
4686
4687 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);