]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
Add SPDX license identifiers to source files under the LGPL
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <glob.h>
24 #include <grp.h>
25 #include <poll.h>
26 #include <signal.h>
27 #include <string.h>
28 #include <sys/capability.h>
29 #include <sys/eventfd.h>
30 #include <sys/mman.h>
31 #include <sys/personality.h>
32 #include <sys/prctl.h>
33 #include <sys/shm.h>
34 #include <sys/socket.h>
35 #include <sys/stat.h>
36 #include <sys/types.h>
37 #include <sys/un.h>
38 #include <unistd.h>
39 #include <utmpx.h>
40
41 #if HAVE_PAM
42 #include <security/pam_appl.h>
43 #endif
44
45 #if HAVE_SELINUX
46 #include <selinux/selinux.h>
47 #endif
48
49 #if HAVE_SECCOMP
50 #include <seccomp.h>
51 #endif
52
53 #if HAVE_APPARMOR
54 #include <sys/apparmor.h>
55 #endif
56
57 #include "sd-messages.h"
58
59 #include "af-list.h"
60 #include "alloc-util.h"
61 #if HAVE_APPARMOR
62 #include "apparmor-util.h"
63 #endif
64 #include "async.h"
65 #include "barrier.h"
66 #include "cap-list.h"
67 #include "capability-util.h"
68 #include "chown-recursive.h"
69 #include "def.h"
70 #include "env-util.h"
71 #include "errno-list.h"
72 #include "execute.h"
73 #include "exit-status.h"
74 #include "fd-util.h"
75 #include "fileio.h"
76 #include "format-util.h"
77 #include "fs-util.h"
78 #include "glob-util.h"
79 #include "io-util.h"
80 #include "ioprio.h"
81 #include "label.h"
82 #include "log.h"
83 #include "macro.h"
84 #include "missing.h"
85 #include "mkdir.h"
86 #include "namespace.h"
87 #include "parse-util.h"
88 #include "path-util.h"
89 #include "process-util.h"
90 #include "rlimit-util.h"
91 #include "rm-rf.h"
92 #if HAVE_SECCOMP
93 #include "seccomp-util.h"
94 #endif
95 #include "securebits.h"
96 #include "securebits-util.h"
97 #include "selinux-util.h"
98 #include "signal-util.h"
99 #include "smack-util.h"
100 #include "special.h"
101 #include "string-table.h"
102 #include "string-util.h"
103 #include "strv.h"
104 #include "syslog-util.h"
105 #include "terminal-util.h"
106 #include "unit.h"
107 #include "user-util.h"
108 #include "util.h"
109 #include "utmp-wtmp.h"
110
/* Timeouts expressed in microseconds (5 s and 1 s respectively).
 * NOTE(review): their users are not visible in this part of the file — presumably the two stages of an idle
 * wait; confirm against the callers. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* This assumes there is a 'tty' group */
/* Access mode chown_terminal() applies to, and then verifies on, a terminal device. */
#define TTY_MODE 0620

/* Send buffer size (8 MiB) requested for the journal stream socket in connect_logger_as(). */
#define SNDBUF_SIZE (8*1024*1024)
118
/* Renumbers the passed file descriptors so that fds[i] ends up being fd number i+3, i.e. they form a
 * contiguous range directly after stdin/stdout/stderr.
 *
 * The array is modified in place: every entry that is moved is duplicated with F_DUPFD, the old fd is closed and
 * the slot is updated with the new number.
 *
 * Returns 0 on success (also when n_fds is 0), or a negative errno if fcntl() fails. */
static int shift_fds(int fds[], unsigned n_fds) {
        int first = 0, retry_at;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int idx;

                retry_at = -1;

                for (idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, moved;

                        /* Nothing to do if the fd already sits in its final slot */
                        if (fds[idx] == wanted)
                                continue;

                        /* F_DUPFD hands out the lowest free fd >= wanted, which might not be wanted itself */
                        moved = fcntl(fds[idx], F_DUPFD, wanted);
                        if (moved < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = moved;

                        /* The target number was still occupied: remember the first such index, so that the
                         * next pass starts there once the occupant has moved out of the way */
                        if (moved != wanted && retry_at < 0)
                                retry_at = idx;
                }

                first = retry_at;
        } while (retry_at >= 0);

        return 0;
}
163
/* Adjusts the file descriptor flags of all fds that are passed on to the child.
 *
 * FD_CLOEXEC is dropped from every fd in the array. O_NONBLOCK is set or cleared (according to 'nonblock') only
 * for the first n_socket_fds entries of the array.
 *
 * Returns 0 on success, or the negative error of the first fd_nonblock()/fd_cloexec() call that fails. */
static int flags_fds(const int fds[], unsigned n_storage_fds, unsigned n_socket_fds, bool nonblock) {
        unsigned total = n_storage_fds + n_socket_fds, idx;

        if (total == 0)
                return 0;

        assert(fds);

        for (idx = 0; idx < total; idx++) {
                int r;

                /* O_NONBLOCK only applies to socket activation */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* These fds are meant to survive exec(), hence always turn off FD_CLOEXEC */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
196
197 static const char *exec_context_tty_path(const ExecContext *context) {
198 assert(context);
199
200 if (context->stdio_as_fds)
201 return NULL;
202
203 if (context->tty_path)
204 return context->tty_path;
205
206 return "/dev/console";
207 }
208
209 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
210 const char *path;
211
212 assert(context);
213
214 path = exec_context_tty_path(context);
215
216 if (context->tty_vhangup) {
217 if (p && p->stdin_fd >= 0)
218 (void) terminal_vhangup_fd(p->stdin_fd);
219 else if (path)
220 (void) terminal_vhangup(path);
221 }
222
223 if (context->tty_reset) {
224 if (p && p->stdin_fd >= 0)
225 (void) reset_terminal_fd(p->stdin_fd, true);
226 else if (path)
227 (void) reset_terminal(path);
228 }
229
230 if (context->tty_vt_disallocate && path)
231 (void) vt_disallocate(path);
232 }
233
234 static bool is_terminal_input(ExecInput i) {
235 return IN_SET(i,
236 EXEC_INPUT_TTY,
237 EXEC_INPUT_TTY_FORCE,
238 EXEC_INPUT_TTY_FAIL);
239 }
240
241 static bool is_terminal_output(ExecOutput o) {
242 return IN_SET(o,
243 EXEC_OUTPUT_TTY,
244 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
245 EXEC_OUTPUT_KMSG_AND_CONSOLE,
246 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
247 }
248
249 static bool is_syslog_output(ExecOutput o) {
250 return IN_SET(o,
251 EXEC_OUTPUT_SYSLOG,
252 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
253 }
254
255 static bool is_kmsg_output(ExecOutput o) {
256 return IN_SET(o,
257 EXEC_OUTPUT_KMSG,
258 EXEC_OUTPUT_KMSG_AND_CONSOLE);
259 }
260
261 static bool exec_context_needs_term(const ExecContext *c) {
262 assert(c);
263
264 /* Return true if the execution context suggests we should set $TERM to something useful. */
265
266 if (is_terminal_input(c->std_input))
267 return true;
268
269 if (is_terminal_output(c->std_output))
270 return true;
271
272 if (is_terminal_output(c->std_error))
273 return true;
274
275 return !!c->tty_path;
276 }
277
/* Opens /dev/null with the given open() flags (plus O_NOCTTY) and installs it as fd number 'nfd'.
 *
 * Returns nfd on success, a negative errno on failure. */
static int open_null_as(int flags, int nfd) {
        int null_fd, ret;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        /* open() already gave us the wanted number, there is nothing to move */
        if (null_fd == nfd)
                return nfd;

        if (dup2(null_fd, nfd) < 0)
                ret = -errno;
        else
                ret = nfd;

        safe_close(null_fd);
        return ret;
}
295
296 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
297 static const union sockaddr_union sa = {
298 .un.sun_family = AF_UNIX,
299 .un.sun_path = "/run/systemd/journal/stdout",
300 };
301 uid_t olduid = UID_INVALID;
302 gid_t oldgid = GID_INVALID;
303 int r;
304
305 if (gid_is_valid(gid)) {
306 oldgid = getgid();
307
308 if (setegid(gid) < 0)
309 return -errno;
310 }
311
312 if (uid_is_valid(uid)) {
313 olduid = getuid();
314
315 if (seteuid(uid) < 0) {
316 r = -errno;
317 goto restore_gid;
318 }
319 }
320
321 r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
322
323 /* If we fail to restore the uid or gid, things will likely
324 fail later on. This should only happen if an LSM interferes. */
325
326 if (uid_is_valid(uid))
327 (void) seteuid(olduid);
328
329 restore_gid:
330 if (gid_is_valid(gid))
331 (void) setegid(oldgid);
332
333 return r;
334 }
335
336 static int connect_logger_as(
337 Unit *unit,
338 const ExecContext *context,
339 const ExecParameters *params,
340 ExecOutput output,
341 const char *ident,
342 int nfd,
343 uid_t uid,
344 gid_t gid) {
345
346 int fd, r;
347
348 assert(context);
349 assert(params);
350 assert(output < _EXEC_OUTPUT_MAX);
351 assert(ident);
352 assert(nfd >= 0);
353
354 fd = socket(AF_UNIX, SOCK_STREAM, 0);
355 if (fd < 0)
356 return -errno;
357
358 r = connect_journal_socket(fd, uid, gid);
359 if (r < 0)
360 return r;
361
362 if (shutdown(fd, SHUT_RD) < 0) {
363 safe_close(fd);
364 return -errno;
365 }
366
367 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
368
369 dprintf(fd,
370 "%s\n"
371 "%s\n"
372 "%i\n"
373 "%i\n"
374 "%i\n"
375 "%i\n"
376 "%i\n",
377 context->syslog_identifier ?: ident,
378 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
379 context->syslog_priority,
380 !!context->syslog_level_prefix,
381 is_syslog_output(output),
382 is_kmsg_output(output),
383 is_terminal_output(output));
384
385 if (fd == nfd)
386 return nfd;
387
388 r = dup2(fd, nfd) < 0 ? -errno : nfd;
389 safe_close(fd);
390
391 return r;
392 }
/* Opens the terminal at 'path' with the given open mode (plus O_NOCTTY) and installs it as fd number 'nfd'.
 *
 * Returns nfd on success, the negative error of open_terminal() or a negative errno from dup2() on failure. */
static int open_terminal_as(const char *path, mode_t mode, int nfd) {
        int tty_fd, ret;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, mode | O_NOCTTY);
        if (tty_fd < 0)
                return tty_fd;

        /* Already at the right number, nothing to move */
        if (tty_fd == nfd)
                return nfd;

        if (dup2(tty_fd, nfd) < 0)
                ret = -errno;
        else
                ret = nfd;

        safe_close(tty_fd);
        return ret;
}
411
412 static int fixup_input(ExecInput std_input, int socket_fd, bool apply_tty_stdin) {
413
414 if (is_terminal_input(std_input) && !apply_tty_stdin)
415 return EXEC_INPUT_NULL;
416
417 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
418 return EXEC_INPUT_NULL;
419
420 return std_input;
421 }
422
423 static int fixup_output(ExecOutput std_output, int socket_fd) {
424
425 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
426 return EXEC_OUTPUT_INHERIT;
427
428 return std_output;
429 }
430
431 static int setup_input(
432 const ExecContext *context,
433 const ExecParameters *params,
434 int socket_fd,
435 int named_iofds[3]) {
436
437 ExecInput i;
438
439 assert(context);
440 assert(params);
441
442 if (params->stdin_fd >= 0) {
443 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
444 return -errno;
445
446 /* Try to make this the controlling tty, if it is a tty, and reset it */
447 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
448 (void) reset_terminal_fd(STDIN_FILENO, true);
449
450 return STDIN_FILENO;
451 }
452
453 i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
454
455 switch (i) {
456
457 case EXEC_INPUT_NULL:
458 return open_null_as(O_RDONLY, STDIN_FILENO);
459
460 case EXEC_INPUT_TTY:
461 case EXEC_INPUT_TTY_FORCE:
462 case EXEC_INPUT_TTY_FAIL: {
463 int fd, r;
464
465 fd = acquire_terminal(exec_context_tty_path(context),
466 i == EXEC_INPUT_TTY_FAIL,
467 i == EXEC_INPUT_TTY_FORCE,
468 false,
469 USEC_INFINITY);
470 if (fd < 0)
471 return fd;
472
473 if (fd != STDIN_FILENO) {
474 r = dup2(fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
475 safe_close(fd);
476 } else
477 r = STDIN_FILENO;
478
479 return r;
480 }
481
482 case EXEC_INPUT_SOCKET:
483 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
484
485 case EXEC_INPUT_NAMED_FD:
486 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
487 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
488
489 default:
490 assert_not_reached("Unknown input type");
491 }
492 }
493
494 static int setup_output(
495 Unit *unit,
496 const ExecContext *context,
497 const ExecParameters *params,
498 int fileno,
499 int socket_fd,
500 int named_iofds[3],
501 const char *ident,
502 uid_t uid,
503 gid_t gid,
504 dev_t *journal_stream_dev,
505 ino_t *journal_stream_ino) {
506
507 ExecOutput o;
508 ExecInput i;
509 int r;
510
511 assert(unit);
512 assert(context);
513 assert(params);
514 assert(ident);
515 assert(journal_stream_dev);
516 assert(journal_stream_ino);
517
518 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
519
520 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
521 return -errno;
522
523 return STDOUT_FILENO;
524 }
525
526 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
527 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
528 return -errno;
529
530 return STDERR_FILENO;
531 }
532
533 i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
534 o = fixup_output(context->std_output, socket_fd);
535
536 if (fileno == STDERR_FILENO) {
537 ExecOutput e;
538 e = fixup_output(context->std_error, socket_fd);
539
540 /* This expects the input and output are already set up */
541
542 /* Don't change the stderr file descriptor if we inherit all
543 * the way and are not on a tty */
544 if (e == EXEC_OUTPUT_INHERIT &&
545 o == EXEC_OUTPUT_INHERIT &&
546 i == EXEC_INPUT_NULL &&
547 !is_terminal_input(context->std_input) &&
548 getppid () != 1)
549 return fileno;
550
551 /* Duplicate from stdout if possible */
552 if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
553 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
554
555 o = e;
556
557 } else if (o == EXEC_OUTPUT_INHERIT) {
558 /* If input got downgraded, inherit the original value */
559 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
560 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
561
562 /* If the input is connected to anything that's not a /dev/null, inherit that... */
563 if (i != EXEC_INPUT_NULL)
564 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
565
566 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
567 if (getppid() != 1)
568 return fileno;
569
570 /* We need to open /dev/null here anew, to get the right access mode. */
571 return open_null_as(O_WRONLY, fileno);
572 }
573
574 switch (o) {
575
576 case EXEC_OUTPUT_NULL:
577 return open_null_as(O_WRONLY, fileno);
578
579 case EXEC_OUTPUT_TTY:
580 if (is_terminal_input(i))
581 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
582
583 /* We don't reset the terminal if this is just about output */
584 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
585
586 case EXEC_OUTPUT_SYSLOG:
587 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
588 case EXEC_OUTPUT_KMSG:
589 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
590 case EXEC_OUTPUT_JOURNAL:
591 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
592 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
593 if (r < 0) {
594 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
595 r = open_null_as(O_WRONLY, fileno);
596 } else {
597 struct stat st;
598
599 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
600 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
601 * services to detect whether they are connected to the journal or not.
602 *
603 * If both stdout and stderr are connected to a stream then let's make sure to store the data
604 * about STDERR as that's usually the best way to do logging. */
605
606 if (fstat(fileno, &st) >= 0 &&
607 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
608 *journal_stream_dev = st.st_dev;
609 *journal_stream_ino = st.st_ino;
610 }
611 }
612 return r;
613
614 case EXEC_OUTPUT_SOCKET:
615 assert(socket_fd >= 0);
616 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
617
618 case EXEC_OUTPUT_NAMED_FD:
619 (void) fd_nonblock(named_iofds[fileno], false);
620 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
621
622 default:
623 assert_not_reached("Unknown error type");
624 }
625 }
626
627 static int chown_terminal(int fd, uid_t uid) {
628 struct stat st;
629
630 assert(fd >= 0);
631
632 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
633 if (isatty(fd) < 1)
634 return 0;
635
636 /* This might fail. What matters are the results. */
637 (void) fchown(fd, uid, -1);
638 (void) fchmod(fd, TTY_MODE);
639
640 if (fstat(fd, &st) < 0)
641 return -errno;
642
643 if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
644 return -EPERM;
645
646 return 0;
647 }
648
649 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
650 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
651 int r;
652
653 assert(_saved_stdin);
654 assert(_saved_stdout);
655
656 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
657 if (saved_stdin < 0)
658 return -errno;
659
660 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
661 if (saved_stdout < 0)
662 return -errno;
663
664 fd = acquire_terminal(vc, false, false, false, DEFAULT_CONFIRM_USEC);
665 if (fd < 0)
666 return fd;
667
668 r = chown_terminal(fd, getuid());
669 if (r < 0)
670 return r;
671
672 r = reset_terminal_fd(fd, true);
673 if (r < 0)
674 return r;
675
676 if (dup2(fd, STDIN_FILENO) < 0)
677 return -errno;
678
679 if (dup2(fd, STDOUT_FILENO) < 0)
680 return -errno;
681
682 if (fd >= 2)
683 safe_close(fd);
684 fd = -1;
685
686 *_saved_stdin = saved_stdin;
687 *_saved_stdout = saved_stdout;
688
689 saved_stdin = saved_stdout = -1;
690
691 return 0;
692 }
693
694 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
695 assert(err < 0);
696
697 if (err == -ETIMEDOUT)
698 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
699 else {
700 errno = -err;
701 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
702 }
703 }
704
705 static void write_confirm_error(int err, const char *vc, const Unit *u) {
706 _cleanup_close_ int fd = -1;
707
708 assert(vc);
709
710 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
711 if (fd < 0)
712 return;
713
714 write_confirm_error_fd(err, fd, u);
715 }
716
/* Undoes setup_confirm_stdio(): releases the terminal, duplicates the saved fds back onto stdin/stdout and
 * closes the saved copies, storing the result of safe_close() back into *saved_stdin and *saved_stdout.
 *
 * Returns 0 on success, otherwise the negative errno of the last dup2() that failed. Both fds are attempted
 * and closed in any case. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
738
739 enum {
740 CONFIRM_PRETEND_FAILURE = -1,
741 CONFIRM_PRETEND_SUCCESS = 0,
742 CONFIRM_EXECUTE = 1,
743 };
744
745 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
746 int saved_stdout = -1, saved_stdin = -1, r;
747 _cleanup_free_ char *e = NULL;
748 char c;
749
750 /* For any internal errors, assume a positive response. */
751 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
752 if (r < 0) {
753 write_confirm_error(r, vc, u);
754 return CONFIRM_EXECUTE;
755 }
756
757 /* confirm_spawn might have been disabled while we were sleeping. */
758 if (manager_is_confirm_spawn_disabled(u->manager)) {
759 r = 1;
760 goto restore_stdio;
761 }
762
763 e = ellipsize(cmdline, 60, 100);
764 if (!e) {
765 log_oom();
766 r = CONFIRM_EXECUTE;
767 goto restore_stdio;
768 }
769
770 for (;;) {
771 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
772 if (r < 0) {
773 write_confirm_error_fd(r, STDOUT_FILENO, u);
774 r = CONFIRM_EXECUTE;
775 goto restore_stdio;
776 }
777
778 switch (c) {
779 case 'c':
780 printf("Resuming normal execution.\n");
781 manager_disable_confirm_spawn();
782 r = 1;
783 break;
784 case 'D':
785 unit_dump(u, stdout, " ");
786 continue; /* ask again */
787 case 'f':
788 printf("Failing execution.\n");
789 r = CONFIRM_PRETEND_FAILURE;
790 break;
791 case 'h':
792 printf(" c - continue, proceed without asking anymore\n"
793 " D - dump, show the state of the unit\n"
794 " f - fail, don't execute the command and pretend it failed\n"
795 " h - help\n"
796 " i - info, show a short summary of the unit\n"
797 " j - jobs, show jobs that are in progress\n"
798 " s - skip, don't execute the command and pretend it succeeded\n"
799 " y - yes, execute the command\n");
800 continue; /* ask again */
801 case 'i':
802 printf(" Description: %s\n"
803 " Unit: %s\n"
804 " Command: %s\n",
805 u->id, u->description, cmdline);
806 continue; /* ask again */
807 case 'j':
808 manager_dump_jobs(u->manager, stdout, " ");
809 continue; /* ask again */
810 case 'n':
811 /* 'n' was removed in favor of 'f'. */
812 printf("Didn't understand 'n', did you mean 'f'?\n");
813 continue; /* ask again */
814 case 's':
815 printf("Skipping execution.\n");
816 r = CONFIRM_PRETEND_SUCCESS;
817 break;
818 case 'y':
819 r = CONFIRM_EXECUTE;
820 break;
821 default:
822 assert_not_reached("Unhandled choice");
823 }
824 break;
825 }
826
827 restore_stdio:
828 restore_confirm_stdio(&saved_stdin, &saved_stdout);
829 return r;
830 }
831
832 static int get_fixed_user(const ExecContext *c, const char **user,
833 uid_t *uid, gid_t *gid,
834 const char **home, const char **shell) {
835 int r;
836 const char *name;
837
838 assert(c);
839
840 if (!c->user)
841 return 0;
842
843 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
844 * (i.e. are "/" or "/bin/nologin"). */
845
846 name = c->user;
847 r = get_user_creds_clean(&name, uid, gid, home, shell);
848 if (r < 0)
849 return r;
850
851 *user = name;
852 return 0;
853 }
854
855 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
856 int r;
857 const char *name;
858
859 assert(c);
860
861 if (!c->group)
862 return 0;
863
864 name = c->group;
865 r = get_group_creds(&name, gid);
866 if (r < 0)
867 return r;
868
869 *group = name;
870 return 0;
871 }
872
873 static int get_supplementary_groups(const ExecContext *c, const char *user,
874 const char *group, gid_t gid,
875 gid_t **supplementary_gids, int *ngids) {
876 char **i;
877 int r, k = 0;
878 int ngroups_max;
879 bool keep_groups = false;
880 gid_t *groups = NULL;
881 _cleanup_free_ gid_t *l_gids = NULL;
882
883 assert(c);
884
885 /*
886 * If user is given, then lookup GID and supplementary groups list.
887 * We avoid NSS lookups for gid=0. Also we have to initialize groups
888 * here and as early as possible so we keep the list of supplementary
889 * groups of the caller.
890 */
891 if (user && gid_is_valid(gid) && gid != 0) {
892 /* First step, initialize groups from /etc/groups */
893 if (initgroups(user, gid) < 0)
894 return -errno;
895
896 keep_groups = true;
897 }
898
899 if (strv_isempty(c->supplementary_groups))
900 return 0;
901
902 /*
903 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
904 * be positive, otherwise fail.
905 */
906 errno = 0;
907 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
908 if (ngroups_max <= 0) {
909 if (errno > 0)
910 return -errno;
911 else
912 return -EOPNOTSUPP; /* For all other values */
913 }
914
915 l_gids = new(gid_t, ngroups_max);
916 if (!l_gids)
917 return -ENOMEM;
918
919 if (keep_groups) {
920 /*
921 * Lookup the list of groups that the user belongs to, we
922 * avoid NSS lookups here too for gid=0.
923 */
924 k = ngroups_max;
925 if (getgrouplist(user, gid, l_gids, &k) < 0)
926 return -EINVAL;
927 } else
928 k = 0;
929
930 STRV_FOREACH(i, c->supplementary_groups) {
931 const char *g;
932
933 if (k >= ngroups_max)
934 return -E2BIG;
935
936 g = *i;
937 r = get_group_creds(&g, l_gids+k);
938 if (r < 0)
939 return r;
940
941 k++;
942 }
943
944 /*
945 * Sets ngids to zero to drop all supplementary groups, happens
946 * when we are under root and SupplementaryGroups= is empty.
947 */
948 if (k == 0) {
949 *ngids = 0;
950 return 0;
951 }
952
953 /* Otherwise get the final list of supplementary groups */
954 groups = memdup(l_gids, sizeof(gid_t) * k);
955 if (!groups)
956 return -ENOMEM;
957
958 *supplementary_gids = groups;
959 *ngids = k;
960
961 groups = NULL;
962
963 return 0;
964 }
965
/* Applies the group credentials: installs the supplementary group list if it is non-empty, then sets real,
 * effective and saved GID to 'gid' if that is valid.
 *
 * Returns 0 on success, a negative errno-style error on failure. */
static int enforce_groups(gid_t gid, gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r;

                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
984
985 static int enforce_user(const ExecContext *context, uid_t uid) {
986 assert(context);
987
988 if (!uid_is_valid(uid))
989 return 0;
990
991 /* Sets (but doesn't look up) the uid and make sure we keep the
992 * capabilities while doing so. */
993
994 if (context->capability_ambient_set != 0) {
995
996 /* First step: If we need to keep capabilities but
997 * drop privileges we need to make sure we keep our
998 * caps, while we drop privileges. */
999 if (uid != 0) {
1000 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1001
1002 if (prctl(PR_GET_SECUREBITS) != sb)
1003 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1004 return -errno;
1005 }
1006 }
1007
1008 /* Second step: actually set the uids */
1009 if (setresuid(uid, uid, uid) < 0)
1010 return -errno;
1011
1012 /* At this point we should have all necessary capabilities but
1013 are otherwise a normal user. However, the caps might got
1014 corrupted due to the setresuid() so we need clean them up
1015 later. This is done outside of this call. */
1016
1017 return 0;
1018 }
1019
1020 #if HAVE_PAM
1021
1022 static int null_conv(
1023 int num_msg,
1024 const struct pam_message **msg,
1025 struct pam_response **resp,
1026 void *appdata_ptr) {
1027
1028 /* We don't support conversations */
1029
1030 return PAM_CONV_ERR;
1031 }
1032
1033 #endif
1034
/* Opens a PAM session for 'user' using the PAM service 'name' and forks off a helper process ("(sd-pam)") that
 * closes the session again once the calling process goes away.
 *
 * 'tty', if non-NULL, is registered as PAM_TTY. The entries of *env are exported into the PAM environment
 * before the session is opened; on success *env is freed and replaced by the environment list PAM returns.
 * 'fds'/'n_fds' are the fds that are closed in the helper process. 'uid' and 'gid' are the credentials the
 * helper drops to.
 *
 * Returns 0 on success (always, if compiled without PAM support). On failure a negative errno-style error is
 * returned — -EPERM for all errors reported by PAM itself — and *env is left untouched. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], unsigned n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless debug logging is enabled */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on a failure has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        pam_pid = fork();
        if (pam_pid < 0) {
                r = -errno;

                /* No child was created, hence nobody needs SIGTERM blocked: put the caller's signal mask back
                 * instead of leaking the blocked signal out of this function. */
                (void) sigprocmask(SIG_SETMASK, &old_ss, NULL);
                goto fail;
        }

        if (pam_pid == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* This string must fit in 10 chars (i.e. the length
                 * of "/sbin/init"), to look pretty in /bin/ps */
                rename_process("(sd-pam)");

                /* Make sure we don't keep open the passed fds in this
                   child. We assume that otherwise only those fds are
                   open here that have been opened by PAM. */
                close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* Sleep until SIGTERM arrives, retrying if the wait is interrupted */
                        for (;;) {
                                if (sigwait(&ss, &sig) < 0) {
                                        if (errno == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment PAM came up with to the caller */
        strv_free(*env);
        *env = e;

        return 0;

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1251
/* Renames the current process after the file name part of 'path', wrapped in parentheses. Only the last 8
 * characters of the file name are used; if the file name is empty the process is renamed to "(...)". */
static void rename_process_from_path(const char *path) {
        char label[11]; /* '(' + up to 8 characters + ')' + NUL */
        const char *base;
        size_t len;

        /* This resulting string must fit in 10 chars (i.e. the length
         * of "/sbin/init") to look pretty in /bin/ps */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        len = strlen(base);
        if (len > 8) {
                /* The end of the process name is usually more interesting, since the first bit might just be
                 * "systemd-" */
                base += len - 8;
                len = 8;
        }

        label[0] = '(';
        memcpy(label + 1, base, len);
        label[len + 1] = ')';
        label[len + 2] = 0;

        rename_process(label);
}
1282
1283 static bool context_has_address_families(const ExecContext *c) {
1284 assert(c);
1285
1286 return c->address_families_whitelist ||
1287 !set_isempty(c->address_families);
1288 }
1289
1290 static bool context_has_syscall_filters(const ExecContext *c) {
1291 assert(c);
1292
1293 return c->syscall_whitelist ||
1294 !hashmap_isempty(c->syscall_filter);
1295 }
1296
1297 static bool context_has_no_new_privileges(const ExecContext *c) {
1298 assert(c);
1299
1300 if (c->no_new_privileges)
1301 return true;
1302
1303 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1304 return false;
1305
1306 /* We need NNP if we have any form of seccomp and are unprivileged */
1307 return context_has_address_families(c) ||
1308 c->memory_deny_write_execute ||
1309 c->restrict_realtime ||
1310 exec_context_restrict_namespaces_set(c) ||
1311 c->protect_kernel_tunables ||
1312 c->protect_kernel_modules ||
1313 c->private_devices ||
1314 context_has_syscall_filters(c) ||
1315 !set_isempty(c->syscall_archs) ||
1316 c->lock_personality;
1317 }
1318
1319 #if HAVE_SECCOMP
1320
1321 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1322
1323 if (is_seccomp_available())
1324 return false;
1325
1326 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1327 return true;
1328 }
1329
1330 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1331 uint32_t negative_action, default_action, action;
1332 int r;
1333
1334 assert(u);
1335 assert(c);
1336
1337 if (!context_has_syscall_filters(c))
1338 return 0;
1339
1340 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1341 return 0;
1342
1343 negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1344
1345 if (c->syscall_whitelist) {
1346 default_action = negative_action;
1347 action = SCMP_ACT_ALLOW;
1348 } else {
1349 default_action = SCMP_ACT_ALLOW;
1350 action = negative_action;
1351 }
1352
1353 if (needs_ambient_hack) {
1354 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1355 if (r < 0)
1356 return r;
1357 }
1358
1359 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1360 }
1361
1362 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1363 assert(u);
1364 assert(c);
1365
1366 if (set_isempty(c->syscall_archs))
1367 return 0;
1368
1369 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1370 return 0;
1371
1372 return seccomp_restrict_archs(c->syscall_archs);
1373 }
1374
1375 static int apply_address_families(const Unit* u, const ExecContext *c) {
1376 assert(u);
1377 assert(c);
1378
1379 if (!context_has_address_families(c))
1380 return 0;
1381
1382 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1383 return 0;
1384
1385 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1386 }
1387
1388 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1389 assert(u);
1390 assert(c);
1391
1392 if (!c->memory_deny_write_execute)
1393 return 0;
1394
1395 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1396 return 0;
1397
1398 return seccomp_memory_deny_write_execute();
1399 }
1400
1401 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1402 assert(u);
1403 assert(c);
1404
1405 if (!c->restrict_realtime)
1406 return 0;
1407
1408 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1409 return 0;
1410
1411 return seccomp_restrict_realtime();
1412 }
1413
1414 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1415 assert(u);
1416 assert(c);
1417
1418 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1419 * let's protect even those systems where this is left on in the kernel. */
1420
1421 if (!c->protect_kernel_tunables)
1422 return 0;
1423
1424 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1425 return 0;
1426
1427 return seccomp_protect_sysctl();
1428 }
1429
1430 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1431 assert(u);
1432 assert(c);
1433
1434 /* Turn off module syscalls on ProtectKernelModules=yes */
1435
1436 if (!c->protect_kernel_modules)
1437 return 0;
1438
1439 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1440 return 0;
1441
1442 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1443 }
1444
1445 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1446 assert(u);
1447 assert(c);
1448
1449 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1450
1451 if (!c->private_devices)
1452 return 0;
1453
1454 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1455 return 0;
1456
1457 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1458 }
1459
1460 static int apply_restrict_namespaces(Unit *u, const ExecContext *c) {
1461 assert(u);
1462 assert(c);
1463
1464 if (!exec_context_restrict_namespaces_set(c))
1465 return 0;
1466
1467 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1468 return 0;
1469
1470 return seccomp_restrict_namespaces(c->restrict_namespaces);
1471 }
1472
1473 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1474 unsigned long personality;
1475 int r;
1476
1477 assert(u);
1478 assert(c);
1479
1480 if (!c->lock_personality)
1481 return 0;
1482
1483 if (skip_seccomp_unavailable(u, "LockPersonality="))
1484 return 0;
1485
1486 personality = c->personality;
1487
1488 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1489 if (personality == PERSONALITY_INVALID) {
1490
1491 r = opinionated_personality(&personality);
1492 if (r < 0)
1493 return r;
1494 }
1495
1496 return seccomp_lock_personality(personality);
1497 }
1498
1499 #endif
1500
1501 static void do_idle_pipe_dance(int idle_pipe[4]) {
1502 assert(idle_pipe);
1503
1504 idle_pipe[1] = safe_close(idle_pipe[1]);
1505 idle_pipe[2] = safe_close(idle_pipe[2]);
1506
1507 if (idle_pipe[0] >= 0) {
1508 int r;
1509
1510 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1511
1512 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1513 ssize_t n;
1514
1515 /* Signal systemd that we are bored and want to continue. */
1516 n = write(idle_pipe[3], "x", 1);
1517 if (n > 0)
1518 /* Wait for systemd to react to the signal above. */
1519 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1520 }
1521
1522 idle_pipe[0] = safe_close(idle_pipe[0]);
1523
1524 }
1525
1526 idle_pipe[3] = safe_close(idle_pipe[3]);
1527 }
1528
1529 static int build_environment(
1530 Unit *u,
1531 const ExecContext *c,
1532 const ExecParameters *p,
1533 unsigned n_fds,
1534 const char *home,
1535 const char *username,
1536 const char *shell,
1537 dev_t journal_stream_dev,
1538 ino_t journal_stream_ino,
1539 char ***ret) {
1540
1541 _cleanup_strv_free_ char **our_env = NULL;
1542 unsigned n_env = 0;
1543 char *x;
1544
1545 assert(u);
1546 assert(c);
1547 assert(ret);
1548
1549 our_env = new0(char*, 14);
1550 if (!our_env)
1551 return -ENOMEM;
1552
1553 if (n_fds > 0) {
1554 _cleanup_free_ char *joined = NULL;
1555
1556 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1557 return -ENOMEM;
1558 our_env[n_env++] = x;
1559
1560 if (asprintf(&x, "LISTEN_FDS=%u", n_fds) < 0)
1561 return -ENOMEM;
1562 our_env[n_env++] = x;
1563
1564 joined = strv_join(p->fd_names, ":");
1565 if (!joined)
1566 return -ENOMEM;
1567
1568 x = strjoin("LISTEN_FDNAMES=", joined);
1569 if (!x)
1570 return -ENOMEM;
1571 our_env[n_env++] = x;
1572 }
1573
1574 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1575 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1576 return -ENOMEM;
1577 our_env[n_env++] = x;
1578
1579 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1580 return -ENOMEM;
1581 our_env[n_env++] = x;
1582 }
1583
1584 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1585 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1586 * check the database directly. */
1587 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1588 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1589 if (!x)
1590 return -ENOMEM;
1591 our_env[n_env++] = x;
1592 }
1593
1594 if (home) {
1595 x = strappend("HOME=", home);
1596 if (!x)
1597 return -ENOMEM;
1598 our_env[n_env++] = x;
1599 }
1600
1601 if (username) {
1602 x = strappend("LOGNAME=", username);
1603 if (!x)
1604 return -ENOMEM;
1605 our_env[n_env++] = x;
1606
1607 x = strappend("USER=", username);
1608 if (!x)
1609 return -ENOMEM;
1610 our_env[n_env++] = x;
1611 }
1612
1613 if (shell) {
1614 x = strappend("SHELL=", shell);
1615 if (!x)
1616 return -ENOMEM;
1617 our_env[n_env++] = x;
1618 }
1619
1620 if (!sd_id128_is_null(u->invocation_id)) {
1621 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1622 return -ENOMEM;
1623
1624 our_env[n_env++] = x;
1625 }
1626
1627 if (exec_context_needs_term(c)) {
1628 const char *tty_path, *term = NULL;
1629
1630 tty_path = exec_context_tty_path(c);
1631
1632 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1633 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1634 * passes to PID 1 ends up all the way in the console login shown. */
1635
1636 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1637 term = getenv("TERM");
1638 if (!term)
1639 term = default_term_for_tty(tty_path);
1640
1641 x = strappend("TERM=", term);
1642 if (!x)
1643 return -ENOMEM;
1644 our_env[n_env++] = x;
1645 }
1646
1647 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1648 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1649 return -ENOMEM;
1650
1651 our_env[n_env++] = x;
1652 }
1653
1654 our_env[n_env++] = NULL;
1655 assert(n_env <= 12);
1656
1657 *ret = our_env;
1658 our_env = NULL;
1659
1660 return 0;
1661 }
1662
1663 static int build_pass_environment(const ExecContext *c, char ***ret) {
1664 _cleanup_strv_free_ char **pass_env = NULL;
1665 size_t n_env = 0, n_bufsize = 0;
1666 char **i;
1667
1668 STRV_FOREACH(i, c->pass_environment) {
1669 _cleanup_free_ char *x = NULL;
1670 char *v;
1671
1672 v = getenv(*i);
1673 if (!v)
1674 continue;
1675 x = strjoin(*i, "=", v);
1676 if (!x)
1677 return -ENOMEM;
1678
1679 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1680 return -ENOMEM;
1681
1682 pass_env[n_env++] = x;
1683 pass_env[n_env] = NULL;
1684 x = NULL;
1685 }
1686
1687 *ret = pass_env;
1688 pass_env = NULL;
1689
1690 return 0;
1691 }
1692
1693 static bool exec_needs_mount_namespace(
1694 const ExecContext *context,
1695 const ExecParameters *params,
1696 ExecRuntime *runtime) {
1697
1698 assert(context);
1699 assert(params);
1700
1701 if (context->root_image)
1702 return true;
1703
1704 if (!strv_isempty(context->read_write_paths) ||
1705 !strv_isempty(context->read_only_paths) ||
1706 !strv_isempty(context->inaccessible_paths))
1707 return true;
1708
1709 if (context->n_bind_mounts > 0 ||
1710 !strv_isempty(context->directories[EXEC_DIRECTORY_RUNTIME].paths) ||
1711 !strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1712 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1713 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths) ||
1714 !strv_isempty(context->directories[EXEC_DIRECTORY_CONFIGURATION].paths))
1715 return true;
1716
1717 if (context->mount_flags != 0)
1718 return true;
1719
1720 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1721 return true;
1722
1723 if (context->private_devices ||
1724 context->protect_system != PROTECT_SYSTEM_NO ||
1725 context->protect_home != PROTECT_HOME_NO ||
1726 context->protect_kernel_tunables ||
1727 context->protect_kernel_modules ||
1728 context->protect_control_groups)
1729 return true;
1730
1731 if (context->mount_apivfs && (context->root_image || context->root_directory))
1732 return true;
1733
1734 return false;
1735 }
1736
1737 static int setup_private_users(uid_t uid, gid_t gid) {
1738 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1739 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1740 _cleanup_close_ int unshare_ready_fd = -1;
1741 _cleanup_(sigkill_waitp) pid_t pid = 0;
1742 uint64_t c = 1;
1743 siginfo_t si;
1744 ssize_t n;
1745 int r;
1746
1747 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1748 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1749 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1750 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1751 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1752 * continues execution normally. */
1753
1754 if (uid != 0 && uid_is_valid(uid)) {
1755 r = asprintf(&uid_map,
1756 "0 0 1\n" /* Map root → root */
1757 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
1758 uid, uid);
1759 if (r < 0)
1760 return -ENOMEM;
1761 } else {
1762 uid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1763 if (!uid_map)
1764 return -ENOMEM;
1765 }
1766
1767 if (gid != 0 && gid_is_valid(gid)) {
1768 r = asprintf(&gid_map,
1769 "0 0 1\n" /* Map root → root */
1770 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
1771 gid, gid);
1772 if (r < 0)
1773 return -ENOMEM;
1774 } else {
1775 gid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1776 if (!gid_map)
1777 return -ENOMEM;
1778 }
1779
1780 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1781 * namespace. */
1782 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1783 if (unshare_ready_fd < 0)
1784 return -errno;
1785
1786 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1787 * failed. */
1788 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1789 return -errno;
1790
1791 pid = fork();
1792 if (pid < 0)
1793 return -errno;
1794
1795 if (pid == 0) {
1796 _cleanup_close_ int fd = -1;
1797 const char *a;
1798 pid_t ppid;
1799
1800 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1801 * here, after the parent opened its own user namespace. */
1802
1803 ppid = getppid();
1804 errno_pipe[0] = safe_close(errno_pipe[0]);
1805
1806 /* Wait until the parent unshared the user namespace */
1807 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1808 r = -errno;
1809 goto child_fail;
1810 }
1811
1812 /* Disable the setgroups() system call in the child user namespace, for good. */
1813 a = procfs_file_alloca(ppid, "setgroups");
1814 fd = open(a, O_WRONLY|O_CLOEXEC);
1815 if (fd < 0) {
1816 if (errno != ENOENT) {
1817 r = -errno;
1818 goto child_fail;
1819 }
1820
1821 /* If the file is missing the kernel is too old, let's continue anyway. */
1822 } else {
1823 if (write(fd, "deny\n", 5) < 0) {
1824 r = -errno;
1825 goto child_fail;
1826 }
1827
1828 fd = safe_close(fd);
1829 }
1830
1831 /* First write the GID map */
1832 a = procfs_file_alloca(ppid, "gid_map");
1833 fd = open(a, O_WRONLY|O_CLOEXEC);
1834 if (fd < 0) {
1835 r = -errno;
1836 goto child_fail;
1837 }
1838 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1839 r = -errno;
1840 goto child_fail;
1841 }
1842 fd = safe_close(fd);
1843
1844 /* The write the UID map */
1845 a = procfs_file_alloca(ppid, "uid_map");
1846 fd = open(a, O_WRONLY|O_CLOEXEC);
1847 if (fd < 0) {
1848 r = -errno;
1849 goto child_fail;
1850 }
1851 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1852 r = -errno;
1853 goto child_fail;
1854 }
1855
1856 _exit(EXIT_SUCCESS);
1857
1858 child_fail:
1859 (void) write(errno_pipe[1], &r, sizeof(r));
1860 _exit(EXIT_FAILURE);
1861 }
1862
1863 errno_pipe[1] = safe_close(errno_pipe[1]);
1864
1865 if (unshare(CLONE_NEWUSER) < 0)
1866 return -errno;
1867
1868 /* Let the child know that the namespace is ready now */
1869 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1870 return -errno;
1871
1872 /* Try to read an error code from the child */
1873 n = read(errno_pipe[0], &r, sizeof(r));
1874 if (n < 0)
1875 return -errno;
1876 if (n == sizeof(r)) { /* an error code was sent to us */
1877 if (r < 0)
1878 return r;
1879 return -EIO;
1880 }
1881 if (n != 0) /* on success we should have read 0 bytes */
1882 return -EIO;
1883
1884 r = wait_for_terminate(pid, &si);
1885 if (r < 0)
1886 return r;
1887 pid = 0;
1888
1889 /* If something strange happened with the child, let's consider this fatal, too */
1890 if (si.si_code != CLD_EXITED || si.si_status != 0)
1891 return -EIO;
1892
1893 return 0;
1894 }
1895
1896 static int setup_exec_directory(
1897 const ExecContext *context,
1898 const ExecParameters *params,
1899 uid_t uid,
1900 gid_t gid,
1901 ExecDirectoryType type,
1902 int *exit_status) {
1903
1904 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1905 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1906 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1907 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1908 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
1909 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
1910 };
1911 char **rt;
1912 int r;
1913
1914 assert(context);
1915 assert(params);
1916 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
1917 assert(exit_status);
1918
1919 if (!params->prefix[type])
1920 return 0;
1921
1922 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
1923 if (!uid_is_valid(uid))
1924 uid = 0;
1925 if (!gid_is_valid(gid))
1926 gid = 0;
1927 }
1928
1929 STRV_FOREACH(rt, context->directories[type].paths) {
1930 _cleanup_free_ char *p = NULL, *pp = NULL;
1931 const char *effective;
1932
1933 p = strjoin(params->prefix[type], "/", *rt);
1934 if (!p) {
1935 r = -ENOMEM;
1936 goto fail;
1937 }
1938
1939 r = mkdir_parents_label(p, 0755);
1940 if (r < 0)
1941 goto fail;
1942
1943 if (context->dynamic_user &&
1944 !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
1945 _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
1946
1947 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
1948 * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
1949 * whose UID is later on reused. To lock this down we use the same trick used by container
1950 * managers to prohibit host users to get access to files of the same UID in containers: we
1951 * place everything inside a directory that has an access mode of 0700 and is owned root:root,
1952 * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
1953 * to make this directory permeable for the service itself.
1954 *
1955 * Specifically: for a service which wants a special directory "foo/" we first create a
1956 * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
1957 * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
1958 * privileged host users can access "foo/" as usual, but unprivileged host users can't look
1959 * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
1960 * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
1961 * disabling the access boundary for the service and making sure it only gets access to the
1962 * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
1963 *
1964 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
1965 * owned by the service itself.
1966 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
1967 * files or sockets with other services. */
1968
1969 private_root = strjoin(params->prefix[type], "/private");
1970 if (!private_root) {
1971 r = -ENOMEM;
1972 goto fail;
1973 }
1974
1975 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
1976 r = mkdir_safe_label(private_root, 0700, 0, 0, false);
1977 if (r < 0)
1978 goto fail;
1979
1980 pp = strjoin(private_root, "/", *rt);
1981 if (!pp) {
1982 r = -ENOMEM;
1983 goto fail;
1984 }
1985
1986 /* Create all directories between the configured directory and this private root, and mark them 0755 */
1987 r = mkdir_parents_label(pp, 0755);
1988 if (r < 0)
1989 goto fail;
1990
1991 /* Finally, create the actual directory for the service */
1992 r = mkdir_label(pp, context->directories[type].mode);
1993 if (r < 0 && r != -EEXIST)
1994 goto fail;
1995
1996 parent = dirname_malloc(p);
1997 if (!parent) {
1998 r = -ENOMEM;
1999 goto fail;
2000 }
2001
2002 r = path_make_relative(parent, pp, &relative);
2003 if (r < 0)
2004 goto fail;
2005
2006 /* And link it up from the original place */
2007 r = symlink_idempotent(relative, p);
2008 if (r < 0)
2009 goto fail;
2010
2011 effective = pp;
2012
2013 } else {
2014 r = mkdir_label(p, context->directories[type].mode);
2015 if (r < 0 && r != -EEXIST)
2016 goto fail;
2017
2018 effective = p;
2019 }
2020
2021 /* First lock down the access mode */
2022 if (chmod(effective, context->directories[type].mode) < 0) {
2023 r = -errno;
2024 goto fail;
2025 }
2026
2027 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2028 * a service, and shall not be writable. */
2029 if (type == EXEC_DIRECTORY_CONFIGURATION)
2030 continue;
2031
2032 /* Then, change the ownership of the whole tree, if necessary */
2033 r = path_chown_recursive(effective, uid, gid);
2034 if (r < 0)
2035 goto fail;
2036 }
2037
2038 return 0;
2039
2040 fail:
2041 *exit_status = exit_status_table[type];
2042 return r;
2043 }
2044
2045 static int setup_smack(
2046 const ExecContext *context,
2047 const ExecCommand *command) {
2048
2049 int r;
2050
2051 assert(context);
2052 assert(command);
2053
2054 if (context->smack_process_label) {
2055 r = mac_smack_apply_pid(0, context->smack_process_label);
2056 if (r < 0)
2057 return r;
2058 }
2059 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2060 else {
2061 _cleanup_free_ char *exec_label = NULL;
2062
2063 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2064 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2065 return r;
2066
2067 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2068 if (r < 0)
2069 return r;
2070 }
2071 #endif
2072
2073 return 0;
2074 }
2075
2076 static int compile_bind_mounts(
2077 const ExecContext *context,
2078 const ExecParameters *params,
2079 BindMount **ret_bind_mounts,
2080 unsigned *ret_n_bind_mounts,
2081 char ***ret_empty_directories) {
2082
2083 _cleanup_strv_free_ char **empty_directories = NULL;
2084 BindMount *bind_mounts;
2085 unsigned n, h = 0, i;
2086 ExecDirectoryType t;
2087 int r;
2088
2089 assert(context);
2090 assert(params);
2091 assert(ret_bind_mounts);
2092 assert(ret_n_bind_mounts);
2093 assert(ret_empty_directories);
2094
2095 n = context->n_bind_mounts;
2096 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2097 if (!params->prefix[t])
2098 continue;
2099
2100 n += strv_length(context->directories[t].paths);
2101 }
2102
2103 if (n <= 0) {
2104 *ret_bind_mounts = NULL;
2105 *ret_n_bind_mounts = 0;
2106 *ret_empty_directories = NULL;
2107 return 0;
2108 }
2109
2110 bind_mounts = new(BindMount, n);
2111 if (!bind_mounts)
2112 return -ENOMEM;
2113
2114 for (i = 0; i < context->n_bind_mounts; i++) {
2115 BindMount *item = context->bind_mounts + i;
2116 char *s, *d;
2117
2118 s = strdup(item->source);
2119 if (!s) {
2120 r = -ENOMEM;
2121 goto finish;
2122 }
2123
2124 d = strdup(item->destination);
2125 if (!d) {
2126 free(s);
2127 r = -ENOMEM;
2128 goto finish;
2129 }
2130
2131 bind_mounts[h++] = (BindMount) {
2132 .source = s,
2133 .destination = d,
2134 .read_only = item->read_only,
2135 .recursive = item->recursive,
2136 .ignore_enoent = item->ignore_enoent,
2137 };
2138 }
2139
2140 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2141 char **suffix;
2142
2143 if (!params->prefix[t])
2144 continue;
2145
2146 if (strv_isempty(context->directories[t].paths))
2147 continue;
2148
2149 if (context->dynamic_user &&
2150 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2151 char *private_root;
2152
2153 /* So this is for a dynamic user, and we need to make sure the process can access its own
2154 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2155 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2156
2157 private_root = strjoin(params->prefix[t], "/private");
2158 if (!private_root) {
2159 r = -ENOMEM;
2160 goto finish;
2161 }
2162
2163 r = strv_consume(&empty_directories, private_root);
2164 if (r < 0) {
2165 r = -ENOMEM;
2166 goto finish;
2167 }
2168 }
2169
2170 STRV_FOREACH(suffix, context->directories[t].paths) {
2171 char *s, *d;
2172
2173 if (context->dynamic_user &&
2174 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2175 s = strjoin(params->prefix[t], "/private/", *suffix);
2176 else
2177 s = strjoin(params->prefix[t], "/", *suffix);
2178 if (!s) {
2179 r = -ENOMEM;
2180 goto finish;
2181 }
2182
2183 d = strdup(s);
2184 if (!d) {
2185 free(s);
2186 r = -ENOMEM;
2187 goto finish;
2188 }
2189
2190 bind_mounts[h++] = (BindMount) {
2191 .source = s,
2192 .destination = d,
2193 .read_only = false,
2194 .recursive = true,
2195 .ignore_enoent = false,
2196 };
2197 }
2198 }
2199
2200 assert(h == n);
2201
2202 *ret_bind_mounts = bind_mounts;
2203 *ret_n_bind_mounts = n;
2204 *ret_empty_directories = empty_directories;
2205
2206 empty_directories = NULL;
2207
2208 return (int) n;
2209
2210 finish:
2211 bind_mount_free_many(bind_mounts, h);
2212 return r;
2213 }
2214
2215 static int apply_mount_namespace(
2216 Unit *u,
2217 ExecCommand *command,
2218 const ExecContext *context,
2219 const ExecParameters *params,
2220 ExecRuntime *runtime) {
2221
2222 _cleanup_strv_free_ char **empty_directories = NULL;
2223 char *tmp = NULL, *var = NULL;
2224 const char *root_dir = NULL, *root_image = NULL;
2225 NamespaceInfo ns_info = {
2226 .ignore_protect_paths = false,
2227 .private_dev = context->private_devices,
2228 .protect_control_groups = context->protect_control_groups,
2229 .protect_kernel_tunables = context->protect_kernel_tunables,
2230 .protect_kernel_modules = context->protect_kernel_modules,
2231 .mount_apivfs = context->mount_apivfs,
2232 };
2233 bool needs_sandboxing;
2234 BindMount *bind_mounts = NULL;
2235 unsigned n_bind_mounts = 0;
2236 int r;
2237
2238 assert(context);
2239
2240 /* The runtime struct only contains the parent of the private /tmp,
2241 * which is non-accessible to world users. Inside of it there's a /tmp
2242 * that is sticky, and that's the one we want to use here. */
2243
2244 if (context->private_tmp && runtime) {
2245 if (runtime->tmp_dir)
2246 tmp = strjoina(runtime->tmp_dir, "/tmp");
2247 if (runtime->var_tmp_dir)
2248 var = strjoina(runtime->var_tmp_dir, "/tmp");
2249 }
2250
2251 if (params->flags & EXEC_APPLY_CHROOT) {
2252 root_image = context->root_image;
2253
2254 if (!root_image)
2255 root_dir = context->root_directory;
2256 }
2257
2258 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2259 if (r < 0)
2260 return r;
2261
2262 /*
2263 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2264 * sandbox info, otherwise enforce it, don't ignore protected paths and
2265 * fail if we are enable to apply the sandbox inside the mount namespace.
2266 */
2267 if (!context->dynamic_user && root_dir)
2268 ns_info.ignore_protect_paths = true;
2269
2270 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2271
2272 r = setup_namespace(root_dir, root_image,
2273 &ns_info, context->read_write_paths,
2274 needs_sandboxing ? context->read_only_paths : NULL,
2275 needs_sandboxing ? context->inaccessible_paths : NULL,
2276 empty_directories,
2277 bind_mounts,
2278 n_bind_mounts,
2279 tmp,
2280 var,
2281 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2282 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2283 context->mount_flags,
2284 DISSECT_IMAGE_DISCARD_ON_LOOP);
2285
2286 bind_mount_free_many(bind_mounts, n_bind_mounts);
2287
2288 /* If we couldn't set up the namespace this is probably due to a
2289 * missing capability. In this case, silently proceeed. */
2290 if (IN_SET(r, -EPERM, -EACCES)) {
2291 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2292 return 0;
2293 }
2294
2295 return r;
2296 }
2297
2298 static int apply_working_directory(
2299 const ExecContext *context,
2300 const ExecParameters *params,
2301 const char *home,
2302 const bool needs_mount_ns,
2303 int *exit_status) {
2304
2305 const char *d, *wd;
2306
2307 assert(context);
2308 assert(exit_status);
2309
2310 if (context->working_directory_home) {
2311
2312 if (!home) {
2313 *exit_status = EXIT_CHDIR;
2314 return -ENXIO;
2315 }
2316
2317 wd = home;
2318
2319 } else if (context->working_directory)
2320 wd = context->working_directory;
2321 else
2322 wd = "/";
2323
2324 if (params->flags & EXEC_APPLY_CHROOT) {
2325 if (!needs_mount_ns && context->root_directory)
2326 if (chroot(context->root_directory) < 0) {
2327 *exit_status = EXIT_CHROOT;
2328 return -errno;
2329 }
2330
2331 d = wd;
2332 } else
2333 d = prefix_roota(context->root_directory, wd);
2334
2335 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2336 *exit_status = EXIT_CHDIR;
2337 return -errno;
2338 }
2339
2340 return 0;
2341 }
2342
2343 static int setup_keyring(
2344 Unit *u,
2345 const ExecContext *context,
2346 const ExecParameters *p,
2347 uid_t uid, gid_t gid) {
2348
2349 key_serial_t keyring;
2350 int r;
2351
2352 assert(u);
2353 assert(context);
2354 assert(p);
2355
2356 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2357 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2358 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2359 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2360 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2361 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2362
2363 if (!(p->flags & EXEC_NEW_KEYRING))
2364 return 0;
2365
2366 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2367 return 0;
2368
2369 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2370 if (keyring == -1) {
2371 if (errno == ENOSYS)
2372 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2373 else if (IN_SET(errno, EACCES, EPERM))
2374 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2375 else if (errno == EDQUOT)
2376 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2377 else
2378 return log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2379
2380 return 0;
2381 }
2382
2383 /* Populate they keyring with the invocation ID by default. */
2384 if (!sd_id128_is_null(u->invocation_id)) {
2385 key_serial_t key;
2386
2387 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2388 if (key == -1)
2389 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2390 else {
2391 if (keyctl(KEYCTL_SETPERM, key,
2392 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2393 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2394 return log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2395 }
2396 }
2397
2398 /* And now, make the keyring owned by the service's user */
2399 if (uid_is_valid(uid) || gid_is_valid(gid))
2400 if (keyctl(KEYCTL_CHOWN, keyring, uid, gid, 0) < 0)
2401 return log_unit_error_errno(u, errno, "Failed to change ownership of session keyring: %m");
2402
2403 /* When requested link the user keyring into the session keyring. */
2404 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2405 uid_t saved_uid;
2406 gid_t saved_gid;
2407
2408 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
2409 * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
2410 * sucks for parallel execution. This mimics what pam_keyinit does, too.*/
2411
2412 saved_uid = getuid();
2413 saved_gid = getgid();
2414
2415 if (gid_is_valid(gid) && gid != saved_gid) {
2416 if (setregid(gid, -1) < 0)
2417 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2418 }
2419
2420 if (uid_is_valid(uid) && uid != saved_uid) {
2421 if (setreuid(uid, -1) < 0) {
2422 (void) setregid(saved_gid, -1);
2423 return log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2424 }
2425 }
2426
2427 if (keyctl(KEYCTL_LINK,
2428 KEY_SPEC_USER_KEYRING,
2429 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2430
2431 r = -errno;
2432
2433 (void) setreuid(saved_uid, -1);
2434 (void) setregid(saved_gid, -1);
2435
2436 return log_unit_error_errno(u, r, "Failed to link user keyring into session keyring: %m");
2437 }
2438
2439 if (uid_is_valid(uid) && uid != saved_uid) {
2440 if (setreuid(saved_uid, -1) < 0) {
2441 (void) setregid(saved_gid, -1);
2442 return log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2443 }
2444 }
2445
2446 if (gid_is_valid(gid) && gid != saved_gid) {
2447 if (setregid(saved_gid, -1) < 0)
2448 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2449 }
2450 }
2451
2452 return 0;
2453 }
2454
/* Appends every non-negative fd of the two-element "pair" to "array", advancing the element counter *n for each fd
 * stored. A NULL pair is accepted and ignored. The caller has to make sure the array has room for two more entries. */
static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
        unsigned k;

        assert(array);
        assert(n);

        if (!pair)
                return;

        for (k = 0; k < 2; k++) {
                /* Negative entries denote "no fd" and are skipped */
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2467
2468 static int close_remaining_fds(
2469 const ExecParameters *params,
2470 ExecRuntime *runtime,
2471 DynamicCreds *dcreds,
2472 int user_lookup_fd,
2473 int socket_fd,
2474 int *fds, unsigned n_fds) {
2475
2476 unsigned n_dont_close = 0;
2477 int dont_close[n_fds + 12];
2478
2479 assert(params);
2480
2481 if (params->stdin_fd >= 0)
2482 dont_close[n_dont_close++] = params->stdin_fd;
2483 if (params->stdout_fd >= 0)
2484 dont_close[n_dont_close++] = params->stdout_fd;
2485 if (params->stderr_fd >= 0)
2486 dont_close[n_dont_close++] = params->stderr_fd;
2487
2488 if (socket_fd >= 0)
2489 dont_close[n_dont_close++] = socket_fd;
2490 if (n_fds > 0) {
2491 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2492 n_dont_close += n_fds;
2493 }
2494
2495 if (runtime)
2496 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2497
2498 if (dcreds) {
2499 if (dcreds->user)
2500 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2501 if (dcreds->group)
2502 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2503 }
2504
2505 if (user_lookup_fd >= 0)
2506 dont_close[n_dont_close++] = user_lookup_fd;
2507
2508 return close_all_fds(dont_close, n_dont_close);
2509 }
2510
2511 static int send_user_lookup(
2512 Unit *unit,
2513 int user_lookup_fd,
2514 uid_t uid,
2515 gid_t gid) {
2516
2517 assert(unit);
2518
2519 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2520 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2521 * specified. */
2522
2523 if (user_lookup_fd < 0)
2524 return 0;
2525
2526 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2527 return 0;
2528
2529 if (writev(user_lookup_fd,
2530 (struct iovec[]) {
2531 IOVEC_INIT(&uid, sizeof(uid)),
2532 IOVEC_INIT(&gid, sizeof(gid)),
2533 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2534 return -errno;
2535
2536 return 0;
2537 }
2538
2539 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2540 int r;
2541
2542 assert(c);
2543 assert(home);
2544 assert(buf);
2545
2546 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2547
2548 if (*home)
2549 return 0;
2550
2551 if (!c->working_directory_home)
2552 return 0;
2553
2554 if (uid == 0) {
2555 /* Hardcode /root as home directory for UID 0 */
2556 *home = "/root";
2557 return 1;
2558 }
2559
2560 r = get_home_dir(buf);
2561 if (r < 0)
2562 return r;
2563
2564 *home = *buf;
2565 return 1;
2566 }
2567
2568 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2569 _cleanup_strv_free_ char ** list = NULL;
2570 ExecDirectoryType t;
2571 int r;
2572
2573 assert(c);
2574 assert(p);
2575 assert(ret);
2576
2577 assert(c->dynamic_user);
2578
2579 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2580 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2581 * directories. */
2582
2583 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2584 char **i;
2585
2586 if (t == EXEC_DIRECTORY_CONFIGURATION)
2587 continue;
2588
2589 if (!p->prefix[t])
2590 continue;
2591
2592 STRV_FOREACH(i, c->directories[t].paths) {
2593 char *e;
2594
2595 if (t == EXEC_DIRECTORY_RUNTIME)
2596 e = strjoin(p->prefix[t], "/", *i);
2597 else
2598 e = strjoin(p->prefix[t], "/private/", *i);
2599 if (!e)
2600 return -ENOMEM;
2601
2602 r = strv_consume(&list, e);
2603 if (r < 0)
2604 return r;
2605 }
2606 }
2607
2608 *ret = list;
2609 list = NULL;
2610
2611 return 0;
2612 }
2613
2614 static int exec_child(
2615 Unit *unit,
2616 ExecCommand *command,
2617 const ExecContext *context,
2618 const ExecParameters *params,
2619 ExecRuntime *runtime,
2620 DynamicCreds *dcreds,
2621 char **argv,
2622 int socket_fd,
2623 int named_iofds[3],
2624 int *fds,
2625 unsigned n_storage_fds,
2626 unsigned n_socket_fds,
2627 char **files_env,
2628 int user_lookup_fd,
2629 int *exit_status) {
2630
2631 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2632 _cleanup_free_ char *mac_selinux_context_net = NULL, *home_buffer = NULL;
2633 _cleanup_free_ gid_t *supplementary_gids = NULL;
2634 const char *username = NULL, *groupname = NULL;
2635 const char *home = NULL, *shell = NULL;
2636 dev_t journal_stream_dev = 0;
2637 ino_t journal_stream_ino = 0;
2638 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2639 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
2640 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
2641 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
2642 #if HAVE_SELINUX
2643 bool use_selinux = false;
2644 #endif
2645 #if ENABLE_SMACK
2646 bool use_smack = false;
2647 #endif
2648 #if HAVE_APPARMOR
2649 bool use_apparmor = false;
2650 #endif
2651 uid_t uid = UID_INVALID;
2652 gid_t gid = GID_INVALID;
2653 int i, r, ngids = 0;
2654 unsigned n_fds;
2655 ExecDirectoryType dt;
2656 int secure_bits;
2657
2658 assert(unit);
2659 assert(command);
2660 assert(context);
2661 assert(params);
2662 assert(exit_status);
2663
2664 rename_process_from_path(command->path);
2665
2666 /* We reset exactly these signals, since they are the
2667 * only ones we set to SIG_IGN in the main daemon. All
2668 * others we leave untouched because we set them to
2669 * SIG_DFL or a valid handler initially, both of which
2670 * will be demoted to SIG_DFL. */
2671 (void) default_signals(SIGNALS_CRASH_HANDLER,
2672 SIGNALS_IGNORE, -1);
2673
2674 if (context->ignore_sigpipe)
2675 (void) ignore_signals(SIGPIPE, -1);
2676
2677 r = reset_signal_mask();
2678 if (r < 0) {
2679 *exit_status = EXIT_SIGNAL_MASK;
2680 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2681 }
2682
2683 if (params->idle_pipe)
2684 do_idle_pipe_dance(params->idle_pipe);
2685
2686 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2687 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2688 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2689 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2690
2691 log_forget_fds();
2692 log_set_open_when_needed(true);
2693
2694 /* In case anything used libc syslog(), close this here, too */
2695 closelog();
2696
2697 n_fds = n_storage_fds + n_socket_fds;
2698 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
2699 if (r < 0) {
2700 *exit_status = EXIT_FDS;
2701 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2702 }
2703
2704 if (!context->same_pgrp)
2705 if (setsid() < 0) {
2706 *exit_status = EXIT_SETSID;
2707 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2708 }
2709
2710 exec_context_tty_reset(context, params);
2711
2712 if (unit_shall_confirm_spawn(unit)) {
2713 const char *vc = params->confirm_spawn;
2714 _cleanup_free_ char *cmdline = NULL;
2715
2716 cmdline = exec_command_line(argv);
2717 if (!cmdline) {
2718 *exit_status = EXIT_MEMORY;
2719 return log_oom();
2720 }
2721
2722 r = ask_for_confirmation(vc, unit, cmdline);
2723 if (r != CONFIRM_EXECUTE) {
2724 if (r == CONFIRM_PRETEND_SUCCESS) {
2725 *exit_status = EXIT_SUCCESS;
2726 return 0;
2727 }
2728 *exit_status = EXIT_CONFIRM;
2729 log_unit_error(unit, "Execution cancelled by the user");
2730 return -ECANCELED;
2731 }
2732 }
2733
2734 if (context->dynamic_user && dcreds) {
2735 _cleanup_strv_free_ char **suggested_paths = NULL;
2736
2737 /* Make sure we bypass our own NSS module for any NSS checks */
2738 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2739 *exit_status = EXIT_USER;
2740 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2741 }
2742
2743 r = compile_suggested_paths(context, params, &suggested_paths);
2744 if (r < 0) {
2745 *exit_status = EXIT_MEMORY;
2746 return log_oom();
2747 }
2748
2749 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2750 if (r < 0) {
2751 *exit_status = EXIT_USER;
2752 if (r == -EILSEQ) {
2753 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2754 return -EOPNOTSUPP;
2755 }
2756 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2757 }
2758
2759 if (!uid_is_valid(uid)) {
2760 *exit_status = EXIT_USER;
2761 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2762 return -ESRCH;
2763 }
2764
2765 if (!gid_is_valid(gid)) {
2766 *exit_status = EXIT_USER;
2767 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2768 return -ESRCH;
2769 }
2770
2771 if (dcreds->user)
2772 username = dcreds->user->name;
2773
2774 } else {
2775 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2776 if (r < 0) {
2777 *exit_status = EXIT_USER;
2778 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2779 }
2780
2781 r = get_fixed_group(context, &groupname, &gid);
2782 if (r < 0) {
2783 *exit_status = EXIT_GROUP;
2784 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2785 }
2786 }
2787
2788 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2789 r = get_supplementary_groups(context, username, groupname, gid,
2790 &supplementary_gids, &ngids);
2791 if (r < 0) {
2792 *exit_status = EXIT_GROUP;
2793 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2794 }
2795
2796 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2797 if (r < 0) {
2798 *exit_status = EXIT_USER;
2799 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2800 }
2801
2802 user_lookup_fd = safe_close(user_lookup_fd);
2803
2804 r = acquire_home(context, uid, &home, &home_buffer);
2805 if (r < 0) {
2806 *exit_status = EXIT_CHDIR;
2807 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2808 }
2809
2810 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2811 * must sure to drop O_NONBLOCK */
2812 if (socket_fd >= 0)
2813 (void) fd_nonblock(socket_fd, false);
2814
2815 r = setup_input(context, params, socket_fd, named_iofds);
2816 if (r < 0) {
2817 *exit_status = EXIT_STDIN;
2818 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2819 }
2820
2821 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2822 if (r < 0) {
2823 *exit_status = EXIT_STDOUT;
2824 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2825 }
2826
2827 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2828 if (r < 0) {
2829 *exit_status = EXIT_STDERR;
2830 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2831 }
2832
2833 if (params->cgroup_path) {
2834 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2835 if (r < 0) {
2836 *exit_status = EXIT_CGROUP;
2837 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2838 }
2839 }
2840
2841 if (context->oom_score_adjust_set) {
2842 char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
2843
2844 /* When we can't make this change due to EPERM, then
2845 * let's silently skip over it. User namespaces
2846 * prohibit write access to this file, and we
2847 * shouldn't trip up over that. */
2848
2849 sprintf(t, "%i", context->oom_score_adjust);
2850 r = write_string_file("/proc/self/oom_score_adj", t, 0);
2851 if (IN_SET(r, -EPERM, -EACCES))
2852 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2853 else if (r < 0) {
2854 *exit_status = EXIT_OOM_ADJUST;
2855 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
2856 }
2857 }
2858
2859 if (context->nice_set)
2860 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2861 *exit_status = EXIT_NICE;
2862 return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
2863 }
2864
2865 if (context->cpu_sched_set) {
2866 struct sched_param param = {
2867 .sched_priority = context->cpu_sched_priority,
2868 };
2869
2870 r = sched_setscheduler(0,
2871 context->cpu_sched_policy |
2872 (context->cpu_sched_reset_on_fork ?
2873 SCHED_RESET_ON_FORK : 0),
2874 &param);
2875 if (r < 0) {
2876 *exit_status = EXIT_SETSCHEDULER;
2877 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
2878 }
2879 }
2880
2881 if (context->cpuset)
2882 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2883 *exit_status = EXIT_CPUAFFINITY;
2884 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
2885 }
2886
2887 if (context->ioprio_set)
2888 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
2889 *exit_status = EXIT_IOPRIO;
2890 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
2891 }
2892
2893 if (context->timer_slack_nsec != NSEC_INFINITY)
2894 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
2895 *exit_status = EXIT_TIMERSLACK;
2896 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
2897 }
2898
2899 if (context->personality != PERSONALITY_INVALID) {
2900 r = safe_personality(context->personality);
2901 if (r < 0) {
2902 *exit_status = EXIT_PERSONALITY;
2903 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
2904 }
2905 }
2906
2907 if (context->utmp_id)
2908 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
2909 context->tty_path,
2910 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
2911 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
2912 USER_PROCESS,
2913 username);
2914
2915 if (context->user) {
2916 r = chown_terminal(STDIN_FILENO, uid);
2917 if (r < 0) {
2918 *exit_status = EXIT_STDIN;
2919 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
2920 }
2921 }
2922
2923 /* If delegation is enabled we'll pass ownership of the cgroup
2924 * (but only in systemd's own controller hierarchy!) to the
2925 * user of the new process. */
2926 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
2927 r = cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0644, uid, gid);
2928 if (r < 0) {
2929 *exit_status = EXIT_CGROUP;
2930 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
2931 }
2932
2933 r = cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0755, uid, gid);
2934 if (r < 0) {
2935 *exit_status = EXIT_CGROUP;
2936 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
2937 }
2938 }
2939
2940 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2941 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
2942 if (r < 0)
2943 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
2944 }
2945
2946 r = build_environment(
2947 unit,
2948 context,
2949 params,
2950 n_fds,
2951 home,
2952 username,
2953 shell,
2954 journal_stream_dev,
2955 journal_stream_ino,
2956 &our_env);
2957 if (r < 0) {
2958 *exit_status = EXIT_MEMORY;
2959 return log_oom();
2960 }
2961
2962 r = build_pass_environment(context, &pass_env);
2963 if (r < 0) {
2964 *exit_status = EXIT_MEMORY;
2965 return log_oom();
2966 }
2967
2968 accum_env = strv_env_merge(5,
2969 params->environment,
2970 our_env,
2971 pass_env,
2972 context->environment,
2973 files_env,
2974 NULL);
2975 if (!accum_env) {
2976 *exit_status = EXIT_MEMORY;
2977 return log_oom();
2978 }
2979 accum_env = strv_env_clean(accum_env);
2980
2981 (void) umask(context->umask);
2982
2983 r = setup_keyring(unit, context, params, uid, gid);
2984 if (r < 0) {
2985 *exit_status = EXIT_KEYRING;
2986 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
2987 }
2988
2989 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
2990 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2991
2992 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
2993 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
2994
2995 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
2996 if (needs_ambient_hack)
2997 needs_setuid = false;
2998 else
2999 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3000
3001 if (needs_sandboxing) {
3002 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3003 * present. The actual MAC context application will happen later, as late as possible, to avoid
3004 * impacting our own code paths. */
3005
3006 #if HAVE_SELINUX
3007 use_selinux = mac_selinux_use();
3008 #endif
3009 #if ENABLE_SMACK
3010 use_smack = mac_smack_use();
3011 #endif
3012 #if HAVE_APPARMOR
3013 use_apparmor = mac_apparmor_use();
3014 #endif
3015 }
3016
3017 if (needs_setuid) {
3018 if (context->pam_name && username) {
3019 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3020 if (r < 0) {
3021 *exit_status = EXIT_PAM;
3022 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3023 }
3024 }
3025 }
3026
3027 if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3028 if (ns_type_supported(NAMESPACE_NET)) {
3029 r = setup_netns(runtime->netns_storage_socket);
3030 if (r < 0) {
3031 *exit_status = EXIT_NETWORK;
3032 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3033 }
3034 } else
3035 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3036 }
3037
3038 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3039 if (needs_mount_namespace) {
3040 r = apply_mount_namespace(unit, command, context, params, runtime);
3041 if (r < 0) {
3042 *exit_status = EXIT_NAMESPACE;
3043 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3044 }
3045 }
3046
3047 /* Apply just after mount namespace setup */
3048 r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3049 if (r < 0)
3050 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3051
3052 /* Drop groups as early as possbile */
3053 if (needs_setuid) {
3054 r = enforce_groups(gid, supplementary_gids, ngids);
3055 if (r < 0) {
3056 *exit_status = EXIT_GROUP;
3057 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3058 }
3059 }
3060
3061 if (needs_sandboxing) {
3062 #if HAVE_SELINUX
3063 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3064 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3065 if (r < 0) {
3066 *exit_status = EXIT_SELINUX_CONTEXT;
3067 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3068 }
3069 }
3070 #endif
3071
3072 if (context->private_users) {
3073 r = setup_private_users(uid, gid);
3074 if (r < 0) {
3075 *exit_status = EXIT_USER;
3076 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3077 }
3078 }
3079 }
3080
3081 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3082 * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3083 * was needed to upload the policy and can now be closed as well. */
3084 r = close_all_fds(fds, n_fds);
3085 if (r >= 0)
3086 r = shift_fds(fds, n_fds);
3087 if (r >= 0)
3088 r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
3089 if (r < 0) {
3090 *exit_status = EXIT_FDS;
3091 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3092 }
3093
3094 secure_bits = context->secure_bits;
3095
3096 if (needs_sandboxing) {
3097 uint64_t bset;
3098
3099 for (i = 0; i < _RLIMIT_MAX; i++) {
3100
3101 if (!context->rlimit[i])
3102 continue;
3103
3104 r = setrlimit_closest(i, context->rlimit[i]);
3105 if (r < 0) {
3106 *exit_status = EXIT_LIMITS;
3107 return log_unit_error_errno(unit, r, "Failed to adjust resource limit %s: %m", rlimit_to_string(i));
3108 }
3109 }
3110
3111 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3112 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3113 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3114 *exit_status = EXIT_LIMITS;
3115 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3116 }
3117 }
3118
3119 bset = context->capability_bounding_set;
3120 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3121 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3122 * instead of us doing that */
3123 if (needs_ambient_hack)
3124 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3125 (UINT64_C(1) << CAP_SETUID) |
3126 (UINT64_C(1) << CAP_SETGID);
3127
3128 if (!cap_test_all(bset)) {
3129 r = capability_bounding_set_drop(bset, false);
3130 if (r < 0) {
3131 *exit_status = EXIT_CAPABILITIES;
3132 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3133 }
3134 }
3135
3136 /* This is done before enforce_user, but ambient set
3137 * does not survive over setresuid() if keep_caps is not set. */
3138 if (!needs_ambient_hack &&
3139 context->capability_ambient_set != 0) {
3140 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3141 if (r < 0) {
3142 *exit_status = EXIT_CAPABILITIES;
3143 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3144 }
3145 }
3146 }
3147
3148 if (needs_setuid) {
3149 if (context->user) {
3150 r = enforce_user(context, uid);
3151 if (r < 0) {
3152 *exit_status = EXIT_USER;
3153 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3154 }
3155
3156 if (!needs_ambient_hack &&
3157 context->capability_ambient_set != 0) {
3158
3159 /* Fix the ambient capabilities after user change. */
3160 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3161 if (r < 0) {
3162 *exit_status = EXIT_CAPABILITIES;
3163 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3164 }
3165
3166 /* If we were asked to change user and ambient capabilities
3167 * were requested, we had to add keep-caps to the securebits
3168 * so that we would maintain the inherited capability set
3169 * through the setresuid(). Make sure that the bit is added
3170 * also to the context secure_bits so that we don't try to
3171 * drop the bit away next. */
3172
3173 secure_bits |= 1<<SECURE_KEEP_CAPS;
3174 }
3175 }
3176 }
3177
3178 if (needs_sandboxing) {
3179 /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3180 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3181 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3182 * are restricted. */
3183
3184 #if HAVE_SELINUX
3185 if (use_selinux) {
3186 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3187
3188 if (exec_context) {
3189 r = setexeccon(exec_context);
3190 if (r < 0) {
3191 *exit_status = EXIT_SELINUX_CONTEXT;
3192 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3193 }
3194 }
3195 }
3196 #endif
3197
3198 #if ENABLE_SMACK
3199 if (use_smack) {
3200 r = setup_smack(context, command);
3201 if (r < 0) {
3202 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3203 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3204 }
3205 }
3206 #endif
3207
3208 #if HAVE_APPARMOR
3209 if (use_apparmor && context->apparmor_profile) {
3210 r = aa_change_onexec(context->apparmor_profile);
3211 if (r < 0 && !context->apparmor_profile_ignore) {
3212 *exit_status = EXIT_APPARMOR_PROFILE;
3213 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3214 }
3215 }
3216 #endif
3217
3218 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3219 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3220 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3221 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3222 *exit_status = EXIT_SECUREBITS;
3223 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3224 }
3225
3226 if (context_has_no_new_privileges(context))
3227 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3228 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3229 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3230 }
3231
3232 #if HAVE_SECCOMP
3233 r = apply_address_families(unit, context);
3234 if (r < 0) {
3235 *exit_status = EXIT_ADDRESS_FAMILIES;
3236 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3237 }
3238
3239 r = apply_memory_deny_write_execute(unit, context);
3240 if (r < 0) {
3241 *exit_status = EXIT_SECCOMP;
3242 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3243 }
3244
3245 r = apply_restrict_realtime(unit, context);
3246 if (r < 0) {
3247 *exit_status = EXIT_SECCOMP;
3248 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3249 }
3250
3251 r = apply_restrict_namespaces(unit, context);
3252 if (r < 0) {
3253 *exit_status = EXIT_SECCOMP;
3254 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3255 }
3256
3257 r = apply_protect_sysctl(unit, context);
3258 if (r < 0) {
3259 *exit_status = EXIT_SECCOMP;
3260 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3261 }
3262
3263 r = apply_protect_kernel_modules(unit, context);
3264 if (r < 0) {
3265 *exit_status = EXIT_SECCOMP;
3266 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3267 }
3268
3269 r = apply_private_devices(unit, context);
3270 if (r < 0) {
3271 *exit_status = EXIT_SECCOMP;
3272 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3273 }
3274
3275 r = apply_syscall_archs(unit, context);
3276 if (r < 0) {
3277 *exit_status = EXIT_SECCOMP;
3278 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3279 }
3280
3281 r = apply_lock_personality(unit, context);
3282 if (r < 0) {
3283 *exit_status = EXIT_SECCOMP;
3284 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3285 }
3286
3287 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3288 * by the filter as little as possible. */
3289 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3290 if (r < 0) {
3291 *exit_status = EXIT_SECCOMP;
3292 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3293 }
3294 #endif
3295 }
3296
3297 if (!strv_isempty(context->unset_environment)) {
3298 char **ee = NULL;
3299
3300 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3301 if (!ee) {
3302 *exit_status = EXIT_MEMORY;
3303 return log_oom();
3304 }
3305
3306 strv_free(accum_env);
3307 accum_env = ee;
3308 }
3309
3310 final_argv = replace_env_argv(argv, accum_env);
3311 if (!final_argv) {
3312 *exit_status = EXIT_MEMORY;
3313 return log_oom();
3314 }
3315
3316 if (_unlikely_(log_get_max_level() >= LOG_DEBUG)) {
3317 _cleanup_free_ char *line;
3318
3319 line = exec_command_line(final_argv);
3320 if (line) {
3321 log_struct(LOG_DEBUG,
3322 "EXECUTABLE=%s", command->path,
3323 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3324 LOG_UNIT_ID(unit),
3325 LOG_UNIT_INVOCATION_ID(unit),
3326 NULL);
3327 }
3328 }
3329
3330 execve(command->path, final_argv, accum_env);
3331
3332 if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3333
3334 log_struct_errno(LOG_INFO, errno,
3335 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3336 LOG_UNIT_ID(unit),
3337 LOG_UNIT_INVOCATION_ID(unit),
3338 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3339 command->path),
3340 "EXECUTABLE=%s", command->path,
3341 NULL);
3342
3343 return 0;
3344 }
3345
3346 *exit_status = EXIT_EXEC;
3347 return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
3348 }
3349
3350 int exec_spawn(Unit *unit,
3351 ExecCommand *command,
3352 const ExecContext *context,
3353 const ExecParameters *params,
3354 ExecRuntime *runtime,
3355 DynamicCreds *dcreds,
3356 pid_t *ret) {
3357
3358 _cleanup_strv_free_ char **files_env = NULL;
3359 int *fds = NULL;
3360 unsigned n_storage_fds = 0, n_socket_fds = 0;
3361 _cleanup_free_ char *line = NULL;
3362 int socket_fd, r;
3363 int named_iofds[3] = { -1, -1, -1 };
3364 char **argv;
3365 pid_t pid;
3366
3367 assert(unit);
3368 assert(command);
3369 assert(context);
3370 assert(ret);
3371 assert(params);
3372 assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
3373
3374 if (context->std_input == EXEC_INPUT_SOCKET ||
3375 context->std_output == EXEC_OUTPUT_SOCKET ||
3376 context->std_error == EXEC_OUTPUT_SOCKET) {
3377
3378 if (params->n_socket_fds > 1) {
3379 log_unit_error(unit, "Got more than one socket.");
3380 return -EINVAL;
3381 }
3382
3383 if (params->n_socket_fds == 0) {
3384 log_unit_error(unit, "Got no socket.");
3385 return -EINVAL;
3386 }
3387
3388 socket_fd = params->fds[0];
3389 } else {
3390 socket_fd = -1;
3391 fds = params->fds;
3392 n_storage_fds = params->n_storage_fds;
3393 n_socket_fds = params->n_socket_fds;
3394 }
3395
3396 r = exec_context_named_iofds(unit, context, params, named_iofds);
3397 if (r < 0)
3398 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3399
3400 r = exec_context_load_environment(unit, context, &files_env);
3401 if (r < 0)
3402 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3403
3404 argv = params->argv ?: command->argv;
3405 line = exec_command_line(argv);
3406 if (!line)
3407 return log_oom();
3408
3409 log_struct(LOG_DEBUG,
3410 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3411 "EXECUTABLE=%s", command->path,
3412 LOG_UNIT_ID(unit),
3413 LOG_UNIT_INVOCATION_ID(unit),
3414 NULL);
3415
3416 pid = fork();
3417 if (pid < 0)
3418 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3419
3420 if (pid == 0) {
3421 int exit_status = EXIT_SUCCESS;
3422
3423 r = exec_child(unit,
3424 command,
3425 context,
3426 params,
3427 runtime,
3428 dcreds,
3429 argv,
3430 socket_fd,
3431 named_iofds,
3432 fds,
3433 n_storage_fds,
3434 n_socket_fds,
3435 files_env,
3436 unit->manager->user_lookup_fds[1],
3437 &exit_status);
3438
3439 if (r < 0) {
3440 log_struct_errno(LOG_ERR, r,
3441 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3442 LOG_UNIT_ID(unit),
3443 LOG_UNIT_INVOCATION_ID(unit),
3444 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3445 exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3446 command->path),
3447 "EXECUTABLE=%s", command->path,
3448 NULL);
3449 }
3450
3451 _exit(exit_status);
3452 }
3453
3454 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3455
3456 /* We add the new process to the cgroup both in the child (so
3457 * that we can be sure that no user code is ever executed
3458 * outside of the cgroup) and in the parent (so that we can be
3459 * sure that when we kill the cgroup the process will be
3460 * killed too). */
3461 if (params->cgroup_path)
3462 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3463
3464 exec_status_start(&command->exec_status, pid);
3465
3466 *ret = pid;
3467 return 0;
3468 }
3469
3470 void exec_context_init(ExecContext *c) {
3471 ExecDirectoryType i;
3472
3473 assert(c);
3474
3475 c->umask = 0022;
3476 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3477 c->cpu_sched_policy = SCHED_OTHER;
3478 c->syslog_priority = LOG_DAEMON|LOG_INFO;
3479 c->syslog_level_prefix = true;
3480 c->ignore_sigpipe = true;
3481 c->timer_slack_nsec = NSEC_INFINITY;
3482 c->personality = PERSONALITY_INVALID;
3483 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3484 c->directories[i].mode = 0755;
3485 c->capability_bounding_set = CAP_ALL;
3486 c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
3487 c->log_level_max = -1;
3488 }
3489
3490 void exec_context_done(ExecContext *c) {
3491 ExecDirectoryType i;
3492 size_t l;
3493
3494 assert(c);
3495
3496 c->environment = strv_free(c->environment);
3497 c->environment_files = strv_free(c->environment_files);
3498 c->pass_environment = strv_free(c->pass_environment);
3499 c->unset_environment = strv_free(c->unset_environment);
3500
3501 for (l = 0; l < ELEMENTSOF(c->rlimit); l++)
3502 c->rlimit[l] = mfree(c->rlimit[l]);
3503
3504 for (l = 0; l < 3; l++)
3505 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3506
3507 c->working_directory = mfree(c->working_directory);
3508 c->root_directory = mfree(c->root_directory);
3509 c->root_image = mfree(c->root_image);
3510 c->tty_path = mfree(c->tty_path);
3511 c->syslog_identifier = mfree(c->syslog_identifier);
3512 c->user = mfree(c->user);
3513 c->group = mfree(c->group);
3514
3515 c->supplementary_groups = strv_free(c->supplementary_groups);
3516
3517 c->pam_name = mfree(c->pam_name);
3518
3519 c->read_only_paths = strv_free(c->read_only_paths);
3520 c->read_write_paths = strv_free(c->read_write_paths);
3521 c->inaccessible_paths = strv_free(c->inaccessible_paths);
3522
3523 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3524
3525 if (c->cpuset)
3526 CPU_FREE(c->cpuset);
3527
3528 c->utmp_id = mfree(c->utmp_id);
3529 c->selinux_context = mfree(c->selinux_context);
3530 c->apparmor_profile = mfree(c->apparmor_profile);
3531 c->smack_process_label = mfree(c->smack_process_label);
3532
3533 c->syscall_filter = hashmap_free(c->syscall_filter);
3534 c->syscall_archs = set_free(c->syscall_archs);
3535 c->address_families = set_free(c->address_families);
3536
3537 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3538 c->directories[i].paths = strv_free(c->directories[i].paths);
3539
3540 c->log_level_max = -1;
3541
3542 exec_context_free_log_extra_fields(c);
3543 }
3544
3545 int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_prefix) {
3546 char **i;
3547
3548 assert(c);
3549
3550 if (!runtime_prefix)
3551 return 0;
3552
3553 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3554 _cleanup_free_ char *p;
3555
3556 p = strjoin(runtime_prefix, "/", *i);
3557 if (!p)
3558 return -ENOMEM;
3559
3560 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3561 * next. */
3562 (void) rm_rf(p, REMOVE_ROOT);
3563 }
3564
3565 return 0;
3566 }
3567
3568 void exec_command_done(ExecCommand *c) {
3569 assert(c);
3570
3571 c->path = mfree(c->path);
3572
3573 c->argv = strv_free(c->argv);
3574 }
3575
3576 void exec_command_done_array(ExecCommand *c, unsigned n) {
3577 unsigned i;
3578
3579 for (i = 0; i < n; i++)
3580 exec_command_done(c+i);
3581 }
3582
3583 ExecCommand* exec_command_free_list(ExecCommand *c) {
3584 ExecCommand *i;
3585
3586 while ((i = c)) {
3587 LIST_REMOVE(command, c, i);
3588 exec_command_done(i);
3589 free(i);
3590 }
3591
3592 return NULL;
3593 }
3594
3595 void exec_command_free_array(ExecCommand **c, unsigned n) {
3596 unsigned i;
3597
3598 for (i = 0; i < n; i++)
3599 c[i] = exec_command_free_list(c[i]);
3600 }
3601
3602 typedef struct InvalidEnvInfo {
3603 Unit *unit;
3604 const char *path;
3605 } InvalidEnvInfo;
3606
3607 static void invalid_env(const char *p, void *userdata) {
3608 InvalidEnvInfo *info = userdata;
3609
3610 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3611 }
3612
3613 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3614 assert(c);
3615
3616 switch (fd_index) {
3617 case STDIN_FILENO:
3618 if (c->std_input != EXEC_INPUT_NAMED_FD)
3619 return NULL;
3620 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3621 case STDOUT_FILENO:
3622 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3623 return NULL;
3624 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3625 case STDERR_FILENO:
3626 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3627 return NULL;
3628 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3629 default:
3630 return NULL;
3631 }
3632 }
3633
3634 int exec_context_named_iofds(Unit *unit, const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3635 unsigned i, targets;
3636 const char* stdio_fdname[3];
3637 unsigned n_fds;
3638
3639 assert(c);
3640 assert(p);
3641
3642 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3643 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3644 (c->std_error == EXEC_OUTPUT_NAMED_FD);
3645
3646 for (i = 0; i < 3; i++)
3647 stdio_fdname[i] = exec_context_fdname(c, i);
3648
3649 n_fds = p->n_storage_fds + p->n_socket_fds;
3650
3651 for (i = 0; i < n_fds && targets > 0; i++)
3652 if (named_iofds[STDIN_FILENO] < 0 &&
3653 c->std_input == EXEC_INPUT_NAMED_FD &&
3654 stdio_fdname[STDIN_FILENO] &&
3655 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3656
3657 named_iofds[STDIN_FILENO] = p->fds[i];
3658 targets--;
3659
3660 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3661 c->std_output == EXEC_OUTPUT_NAMED_FD &&
3662 stdio_fdname[STDOUT_FILENO] &&
3663 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3664
3665 named_iofds[STDOUT_FILENO] = p->fds[i];
3666 targets--;
3667
3668 } else if (named_iofds[STDERR_FILENO] < 0 &&
3669 c->std_error == EXEC_OUTPUT_NAMED_FD &&
3670 stdio_fdname[STDERR_FILENO] &&
3671 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3672
3673 named_iofds[STDERR_FILENO] = p->fds[i];
3674 targets--;
3675 }
3676
3677 return targets == 0 ? 0 : -ENOENT;
3678 }
3679
3680 int exec_context_load_environment(Unit *unit, const ExecContext *c, char ***l) {
3681 char **i, **r = NULL;
3682
3683 assert(c);
3684 assert(l);
3685
3686 STRV_FOREACH(i, c->environment_files) {
3687 char *fn;
3688 int k;
3689 unsigned n;
3690 bool ignore = false;
3691 char **p;
3692 _cleanup_globfree_ glob_t pglob = {};
3693
3694 fn = *i;
3695
3696 if (fn[0] == '-') {
3697 ignore = true;
3698 fn++;
3699 }
3700
3701 if (!path_is_absolute(fn)) {
3702 if (ignore)
3703 continue;
3704
3705 strv_free(r);
3706 return -EINVAL;
3707 }
3708
3709 /* Filename supports globbing, take all matching files */
3710 k = safe_glob(fn, 0, &pglob);
3711 if (k < 0) {
3712 if (ignore)
3713 continue;
3714
3715 strv_free(r);
3716 return k;
3717 }
3718
3719 /* When we don't match anything, -ENOENT should be returned */
3720 assert(pglob.gl_pathc > 0);
3721
3722 for (n = 0; n < pglob.gl_pathc; n++) {
3723 k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3724 if (k < 0) {
3725 if (ignore)
3726 continue;
3727
3728 strv_free(r);
3729 return k;
3730 }
3731 /* Log invalid environment variables with filename */
3732 if (p) {
3733 InvalidEnvInfo info = {
3734 .unit = unit,
3735 .path = pglob.gl_pathv[n]
3736 };
3737
3738 p = strv_env_clean_with_callback(p, invalid_env, &info);
3739 }
3740
3741 if (r == NULL)
3742 r = p;
3743 else {
3744 char **m;
3745
3746 m = strv_env_merge(2, r, p);
3747 strv_free(r);
3748 strv_free(p);
3749 if (!m)
3750 return -ENOMEM;
3751
3752 r = m;
3753 }
3754 }
3755 }
3756
3757 *l = r;
3758
3759 return 0;
3760 }
3761
3762 static bool tty_may_match_dev_console(const char *tty) {
3763 _cleanup_free_ char *active = NULL;
3764 char *console;
3765
3766 if (!tty)
3767 return true;
3768
3769 tty = skip_dev_prefix(tty);
3770
3771 /* trivial identity? */
3772 if (streq(tty, "console"))
3773 return true;
3774
3775 console = resolve_dev_console(&active);
3776 /* if we could not resolve, assume it may */
3777 if (!console)
3778 return true;
3779
3780 /* "tty0" means the active VC, so it may be the same sometimes */
3781 return streq(console, tty) || (streq(console, "tty0") && tty_is_vc(tty));
3782 }
3783
3784 bool exec_context_may_touch_console(ExecContext *ec) {
3785
3786 return (ec->tty_reset ||
3787 ec->tty_vhangup ||
3788 ec->tty_vt_disallocate ||
3789 is_terminal_input(ec->std_input) ||
3790 is_terminal_output(ec->std_output) ||
3791 is_terminal_output(ec->std_error)) &&
3792 tty_may_match_dev_console(exec_context_tty_path(ec));
3793 }
3794
3795 static void strv_fprintf(FILE *f, char **l) {
3796 char **g;
3797
3798 assert(f);
3799
3800 STRV_FOREACH(g, l)
3801 fprintf(f, " %s", *g);
3802 }
3803
3804 void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
3805 ExecDirectoryType dt;
3806 char **e, **d;
3807 unsigned i;
3808 int r;
3809
3810 assert(c);
3811 assert(f);
3812
3813 prefix = strempty(prefix);
3814
3815 fprintf(f,
3816 "%sUMask: %04o\n"
3817 "%sWorkingDirectory: %s\n"
3818 "%sRootDirectory: %s\n"
3819 "%sNonBlocking: %s\n"
3820 "%sPrivateTmp: %s\n"
3821 "%sPrivateDevices: %s\n"
3822 "%sProtectKernelTunables: %s\n"
3823 "%sProtectKernelModules: %s\n"
3824 "%sProtectControlGroups: %s\n"
3825 "%sPrivateNetwork: %s\n"
3826 "%sPrivateUsers: %s\n"
3827 "%sProtectHome: %s\n"
3828 "%sProtectSystem: %s\n"
3829 "%sMountAPIVFS: %s\n"
3830 "%sIgnoreSIGPIPE: %s\n"
3831 "%sMemoryDenyWriteExecute: %s\n"
3832 "%sRestrictRealtime: %s\n"
3833 "%sKeyringMode: %s\n",
3834 prefix, c->umask,
3835 prefix, c->working_directory ? c->working_directory : "/",
3836 prefix, c->root_directory ? c->root_directory : "/",
3837 prefix, yes_no(c->non_blocking),
3838 prefix, yes_no(c->private_tmp),
3839 prefix, yes_no(c->private_devices),
3840 prefix, yes_no(c->protect_kernel_tunables),
3841 prefix, yes_no(c->protect_kernel_modules),
3842 prefix, yes_no(c->protect_control_groups),
3843 prefix, yes_no(c->private_network),
3844 prefix, yes_no(c->private_users),
3845 prefix, protect_home_to_string(c->protect_home),
3846 prefix, protect_system_to_string(c->protect_system),
3847 prefix, yes_no(c->mount_apivfs),
3848 prefix, yes_no(c->ignore_sigpipe),
3849 prefix, yes_no(c->memory_deny_write_execute),
3850 prefix, yes_no(c->restrict_realtime),
3851 prefix, exec_keyring_mode_to_string(c->keyring_mode));
3852
3853 if (c->root_image)
3854 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
3855
3856 STRV_FOREACH(e, c->environment)
3857 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
3858
3859 STRV_FOREACH(e, c->environment_files)
3860 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
3861
3862 STRV_FOREACH(e, c->pass_environment)
3863 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
3864
3865 STRV_FOREACH(e, c->unset_environment)
3866 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
3867
3868 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
3869
3870 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3871 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
3872
3873 STRV_FOREACH(d, c->directories[dt].paths)
3874 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
3875 }
3876
3877 if (c->nice_set)
3878 fprintf(f,
3879 "%sNice: %i\n",
3880 prefix, c->nice);
3881
3882 if (c->oom_score_adjust_set)
3883 fprintf(f,
3884 "%sOOMScoreAdjust: %i\n",
3885 prefix, c->oom_score_adjust);
3886
3887 for (i = 0; i < RLIM_NLIMITS; i++)
3888 if (c->rlimit[i]) {
3889 fprintf(f, "%s%s: " RLIM_FMT "\n",
3890 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
3891 fprintf(f, "%s%sSoft: " RLIM_FMT "\n",
3892 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
3893 }
3894
3895 if (c->ioprio_set) {
3896 _cleanup_free_ char *class_str = NULL;
3897
3898 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
3899 if (r >= 0)
3900 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
3901
3902 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
3903 }
3904
3905 if (c->cpu_sched_set) {
3906 _cleanup_free_ char *policy_str = NULL;
3907
3908 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
3909 if (r >= 0)
3910 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
3911
3912 fprintf(f,
3913 "%sCPUSchedulingPriority: %i\n"
3914 "%sCPUSchedulingResetOnFork: %s\n",
3915 prefix, c->cpu_sched_priority,
3916 prefix, yes_no(c->cpu_sched_reset_on_fork));
3917 }
3918
3919 if (c->cpuset) {
3920 fprintf(f, "%sCPUAffinity:", prefix);
3921 for (i = 0; i < c->cpuset_ncpus; i++)
3922 if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
3923 fprintf(f, " %u", i);
3924 fputs("\n", f);
3925 }
3926
3927 if (c->timer_slack_nsec != NSEC_INFINITY)
3928 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
3929
3930 fprintf(f,
3931 "%sStandardInput: %s\n"
3932 "%sStandardOutput: %s\n"
3933 "%sStandardError: %s\n",
3934 prefix, exec_input_to_string(c->std_input),
3935 prefix, exec_output_to_string(c->std_output),
3936 prefix, exec_output_to_string(c->std_error));
3937
3938 if (c->tty_path)
3939 fprintf(f,
3940 "%sTTYPath: %s\n"
3941 "%sTTYReset: %s\n"
3942 "%sTTYVHangup: %s\n"
3943 "%sTTYVTDisallocate: %s\n",
3944 prefix, c->tty_path,
3945 prefix, yes_no(c->tty_reset),
3946 prefix, yes_no(c->tty_vhangup),
3947 prefix, yes_no(c->tty_vt_disallocate));
3948
3949 if (IN_SET(c->std_output,
3950 EXEC_OUTPUT_SYSLOG,
3951 EXEC_OUTPUT_KMSG,
3952 EXEC_OUTPUT_JOURNAL,
3953 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
3954 EXEC_OUTPUT_KMSG_AND_CONSOLE,
3955 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
3956 IN_SET(c->std_error,
3957 EXEC_OUTPUT_SYSLOG,
3958 EXEC_OUTPUT_KMSG,
3959 EXEC_OUTPUT_JOURNAL,
3960 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
3961 EXEC_OUTPUT_KMSG_AND_CONSOLE,
3962 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
3963
3964 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
3965
3966 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
3967 if (r >= 0)
3968 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
3969
3970 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
3971 if (r >= 0)
3972 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
3973 }
3974
3975 if (c->log_level_max >= 0) {
3976 _cleanup_free_ char *t = NULL;
3977
3978 (void) log_level_to_string_alloc(c->log_level_max, &t);
3979
3980 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
3981 }
3982
3983 if (c->n_log_extra_fields > 0) {
3984 size_t j;
3985
3986 for (j = 0; j < c->n_log_extra_fields; j++) {
3987 fprintf(f, "%sLogExtraFields: ", prefix);
3988 fwrite(c->log_extra_fields[j].iov_base,
3989 1, c->log_extra_fields[j].iov_len,
3990 f);
3991 fputc('\n', f);
3992 }
3993 }
3994
3995 if (c->secure_bits) {
3996 _cleanup_free_ char *str = NULL;
3997
3998 r = secure_bits_to_string_alloc(c->secure_bits, &str);
3999 if (r >= 0)
4000 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4001 }
4002
4003 if (c->capability_bounding_set != CAP_ALL) {
4004 _cleanup_free_ char *str = NULL;
4005
4006 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4007 if (r >= 0)
4008 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4009 }
4010
4011 if (c->capability_ambient_set != 0) {
4012 _cleanup_free_ char *str = NULL;
4013
4014 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4015 if (r >= 0)
4016 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4017 }
4018
4019 if (c->user)
4020 fprintf(f, "%sUser: %s\n", prefix, c->user);
4021 if (c->group)
4022 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4023
4024 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4025
4026 if (!strv_isempty(c->supplementary_groups)) {
4027 fprintf(f, "%sSupplementaryGroups:", prefix);
4028 strv_fprintf(f, c->supplementary_groups);
4029 fputs("\n", f);
4030 }
4031
4032 if (c->pam_name)
4033 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4034
4035 if (strv_length(c->read_write_paths) > 0) {
4036 fprintf(f, "%sReadWritePaths:", prefix);
4037 strv_fprintf(f, c->read_write_paths);
4038 fputs("\n", f);
4039 }
4040
4041 if (strv_length(c->read_only_paths) > 0) {
4042 fprintf(f, "%sReadOnlyPaths:", prefix);
4043 strv_fprintf(f, c->read_only_paths);
4044 fputs("\n", f);
4045 }
4046
4047 if (strv_length(c->inaccessible_paths) > 0) {
4048 fprintf(f, "%sInaccessiblePaths:", prefix);
4049 strv_fprintf(f, c->inaccessible_paths);
4050 fputs("\n", f);
4051 }
4052
4053 if (c->n_bind_mounts > 0)
4054 for (i = 0; i < c->n_bind_mounts; i++) {
4055 fprintf(f, "%s%s: %s:%s:%s\n", prefix,
4056 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4057 c->bind_mounts[i].source,
4058 c->bind_mounts[i].destination,
4059 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4060 }
4061
4062 if (c->utmp_id)
4063 fprintf(f,
4064 "%sUtmpIdentifier: %s\n",
4065 prefix, c->utmp_id);
4066
4067 if (c->selinux_context)
4068 fprintf(f,
4069 "%sSELinuxContext: %s%s\n",
4070 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4071
4072 if (c->apparmor_profile)
4073 fprintf(f,
4074 "%sAppArmorProfile: %s%s\n",
4075 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4076
4077 if (c->smack_process_label)
4078 fprintf(f,
4079 "%sSmackProcessLabel: %s%s\n",
4080 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4081
4082 if (c->personality != PERSONALITY_INVALID)
4083 fprintf(f,
4084 "%sPersonality: %s\n",
4085 prefix, strna(personality_to_string(c->personality)));
4086
4087 fprintf(f,
4088 "%sLockPersonality: %s\n",
4089 prefix, yes_no(c->lock_personality));
4090
4091 if (c->syscall_filter) {
4092 #if HAVE_SECCOMP
4093 Iterator j;
4094 void *id, *val;
4095 bool first = true;
4096 #endif
4097
4098 fprintf(f,
4099 "%sSystemCallFilter: ",
4100 prefix);
4101
4102 if (!c->syscall_whitelist)
4103 fputc('~', f);
4104
4105 #if HAVE_SECCOMP
4106 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4107 _cleanup_free_ char *name = NULL;
4108 const char *errno_name = NULL;
4109 int num = PTR_TO_INT(val);
4110
4111 if (first)
4112 first = false;
4113 else
4114 fputc(' ', f);
4115
4116 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4117 fputs(strna(name), f);
4118
4119 if (num >= 0) {
4120 errno_name = errno_to_name(num);
4121 if (errno_name)
4122 fprintf(f, ":%s", errno_name);
4123 else
4124 fprintf(f, ":%d", num);
4125 }
4126 }
4127 #endif
4128
4129 fputc('\n', f);
4130 }
4131
4132 if (c->syscall_archs) {
4133 #if HAVE_SECCOMP
4134 Iterator j;
4135 void *id;
4136 #endif
4137
4138 fprintf(f,
4139 "%sSystemCallArchitectures:",
4140 prefix);
4141
4142 #if HAVE_SECCOMP
4143 SET_FOREACH(id, c->syscall_archs, j)
4144 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4145 #endif
4146 fputc('\n', f);
4147 }
4148
4149 if (exec_context_restrict_namespaces_set(c)) {
4150 _cleanup_free_ char *s = NULL;
4151
4152 r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
4153 if (r >= 0)
4154 fprintf(f, "%sRestrictNamespaces: %s\n",
4155 prefix, s);
4156 }
4157
4158 if (c->syscall_errno > 0) {
4159 const char *errno_name;
4160
4161 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4162
4163 errno_name = errno_to_name(c->syscall_errno);
4164 if (errno_name)
4165 fprintf(f, "%s\n", errno_name);
4166 else
4167 fprintf(f, "%d\n", c->syscall_errno);
4168 }
4169
4170 if (c->apparmor_profile)
4171 fprintf(f,
4172 "%sAppArmorProfile: %s%s\n",
4173 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4174 }
4175
4176 bool exec_context_maintains_privileges(ExecContext *c) {
4177 assert(c);
4178
4179 /* Returns true if the process forked off would run under
4180 * an unchanged UID or as root. */
4181
4182 if (!c->user)
4183 return true;
4184
4185 if (streq(c->user, "root") || streq(c->user, "0"))
4186 return true;
4187
4188 return false;
4189 }
4190
4191 int exec_context_get_effective_ioprio(ExecContext *c) {
4192 int p;
4193
4194 assert(c);
4195
4196 if (c->ioprio_set)
4197 return c->ioprio;
4198
4199 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4200 if (p < 0)
4201 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4202
4203 return p;
4204 }
4205
4206 void exec_context_free_log_extra_fields(ExecContext *c) {
4207 size_t l;
4208
4209 assert(c);
4210
4211 for (l = 0; l < c->n_log_extra_fields; l++)
4212 free(c->log_extra_fields[l].iov_base);
4213 c->log_extra_fields = mfree(c->log_extra_fields);
4214 c->n_log_extra_fields = 0;
4215 }
4216
4217 void exec_status_start(ExecStatus *s, pid_t pid) {
4218 assert(s);
4219
4220 zero(*s);
4221 s->pid = pid;
4222 dual_timestamp_get(&s->start_timestamp);
4223 }
4224
4225 void exec_status_exit(ExecStatus *s, ExecContext *context, pid_t pid, int code, int status) {
4226 assert(s);
4227
4228 if (s->pid && s->pid != pid)
4229 zero(*s);
4230
4231 s->pid = pid;
4232 dual_timestamp_get(&s->exit_timestamp);
4233
4234 s->code = code;
4235 s->status = status;
4236
4237 if (context) {
4238 if (context->utmp_id)
4239 utmp_put_dead_process(context->utmp_id, pid, code, status);
4240
4241 exec_context_tty_reset(context, NULL);
4242 }
4243 }
4244
4245 void exec_status_dump(ExecStatus *s, FILE *f, const char *prefix) {
4246 char buf[FORMAT_TIMESTAMP_MAX];
4247
4248 assert(s);
4249 assert(f);
4250
4251 if (s->pid <= 0)
4252 return;
4253
4254 prefix = strempty(prefix);
4255
4256 fprintf(f,
4257 "%sPID: "PID_FMT"\n",
4258 prefix, s->pid);
4259
4260 if (dual_timestamp_is_set(&s->start_timestamp))
4261 fprintf(f,
4262 "%sStart Timestamp: %s\n",
4263 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4264
4265 if (dual_timestamp_is_set(&s->exit_timestamp))
4266 fprintf(f,
4267 "%sExit Timestamp: %s\n"
4268 "%sExit Code: %s\n"
4269 "%sExit Status: %i\n",
4270 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4271 prefix, sigchld_code_to_string(s->code),
4272 prefix, s->status);
4273 }
4274
4275 char *exec_command_line(char **argv) {
4276 size_t k;
4277 char *n, *p, **a;
4278 bool first = true;
4279
4280 assert(argv);
4281
4282 k = 1;
4283 STRV_FOREACH(a, argv)
4284 k += strlen(*a)+3;
4285
4286 n = new(char, k);
4287 if (!n)
4288 return NULL;
4289
4290 p = n;
4291 STRV_FOREACH(a, argv) {
4292
4293 if (!first)
4294 *(p++) = ' ';
4295 else
4296 first = false;
4297
4298 if (strpbrk(*a, WHITESPACE)) {
4299 *(p++) = '\'';
4300 p = stpcpy(p, *a);
4301 *(p++) = '\'';
4302 } else
4303 p = stpcpy(p, *a);
4304
4305 }
4306
4307 *p = 0;
4308
4309 /* FIXME: this doesn't really handle arguments that have
4310 * spaces and ticks in them */
4311
4312 return n;
4313 }
4314
4315 void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4316 _cleanup_free_ char *cmd = NULL;
4317 const char *prefix2;
4318
4319 assert(c);
4320 assert(f);
4321
4322 prefix = strempty(prefix);
4323 prefix2 = strjoina(prefix, "\t");
4324
4325 cmd = exec_command_line(c->argv);
4326 fprintf(f,
4327 "%sCommand Line: %s\n",
4328 prefix, cmd ? cmd : strerror(ENOMEM));
4329
4330 exec_status_dump(&c->exec_status, f, prefix2);
4331 }
4332
4333 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4334 assert(f);
4335
4336 prefix = strempty(prefix);
4337
4338 LIST_FOREACH(command, c, c)
4339 exec_command_dump(c, f, prefix);
4340 }
4341
4342 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4343 ExecCommand *end;
4344
4345 assert(l);
4346 assert(e);
4347
4348 if (*l) {
4349 /* It's kind of important, that we keep the order here */
4350 LIST_FIND_TAIL(command, *l, end);
4351 LIST_INSERT_AFTER(command, *l, end, e);
4352 } else
4353 *l = e;
4354 }
4355
4356 int exec_command_set(ExecCommand *c, const char *path, ...) {
4357 va_list ap;
4358 char **l, *p;
4359
4360 assert(c);
4361 assert(path);
4362
4363 va_start(ap, path);
4364 l = strv_new_ap(path, ap);
4365 va_end(ap);
4366
4367 if (!l)
4368 return -ENOMEM;
4369
4370 p = strdup(path);
4371 if (!p) {
4372 strv_free(l);
4373 return -ENOMEM;
4374 }
4375
4376 free(c->path);
4377 c->path = p;
4378
4379 strv_free(c->argv);
4380 c->argv = l;
4381
4382 return 0;
4383 }
4384
4385 int exec_command_append(ExecCommand *c, const char *path, ...) {
4386 _cleanup_strv_free_ char **l = NULL;
4387 va_list ap;
4388 int r;
4389
4390 assert(c);
4391 assert(path);
4392
4393 va_start(ap, path);
4394 l = strv_new_ap(path, ap);
4395 va_end(ap);
4396
4397 if (!l)
4398 return -ENOMEM;
4399
4400 r = strv_extend_strv(&c->argv, l, false);
4401 if (r < 0)
4402 return r;
4403
4404 return 0;
4405 }
4406
4407
4408 static int exec_runtime_allocate(ExecRuntime **rt) {
4409
4410 if (*rt)
4411 return 0;
4412
4413 *rt = new0(ExecRuntime, 1);
4414 if (!*rt)
4415 return -ENOMEM;
4416
4417 (*rt)->n_ref = 1;
4418 (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4419
4420 return 0;
4421 }
4422
4423 int exec_runtime_make(ExecRuntime **rt, ExecContext *c, const char *id) {
4424 int r;
4425
4426 assert(rt);
4427 assert(c);
4428 assert(id);
4429
4430 if (*rt)
4431 return 1;
4432
4433 if (!c->private_network && !c->private_tmp)
4434 return 0;
4435
4436 r = exec_runtime_allocate(rt);
4437 if (r < 0)
4438 return r;
4439
4440 if (c->private_network && (*rt)->netns_storage_socket[0] < 0) {
4441 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, (*rt)->netns_storage_socket) < 0)
4442 return -errno;
4443 }
4444
4445 if (c->private_tmp && !(*rt)->tmp_dir) {
4446 r = setup_tmp_dirs(id, &(*rt)->tmp_dir, &(*rt)->var_tmp_dir);
4447 if (r < 0)
4448 return r;
4449 }
4450
4451 return 1;
4452 }
4453
4454 ExecRuntime *exec_runtime_ref(ExecRuntime *r) {
4455 assert(r);
4456 assert(r->n_ref > 0);
4457
4458 r->n_ref++;
4459 return r;
4460 }
4461
4462 ExecRuntime *exec_runtime_unref(ExecRuntime *r) {
4463
4464 if (!r)
4465 return NULL;
4466
4467 assert(r->n_ref > 0);
4468
4469 r->n_ref--;
4470 if (r->n_ref > 0)
4471 return NULL;
4472
4473 free(r->tmp_dir);
4474 free(r->var_tmp_dir);
4475 safe_close_pair(r->netns_storage_socket);
4476 return mfree(r);
4477 }
4478
4479 int exec_runtime_serialize(Unit *u, ExecRuntime *rt, FILE *f, FDSet *fds) {
4480 assert(u);
4481 assert(f);
4482 assert(fds);
4483
4484 if (!rt)
4485 return 0;
4486
4487 if (rt->tmp_dir)
4488 unit_serialize_item(u, f, "tmp-dir", rt->tmp_dir);
4489
4490 if (rt->var_tmp_dir)
4491 unit_serialize_item(u, f, "var-tmp-dir", rt->var_tmp_dir);
4492
4493 if (rt->netns_storage_socket[0] >= 0) {
4494 int copy;
4495
4496 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4497 if (copy < 0)
4498 return copy;
4499
4500 unit_serialize_item_format(u, f, "netns-socket-0", "%i", copy);
4501 }
4502
4503 if (rt->netns_storage_socket[1] >= 0) {
4504 int copy;
4505
4506 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4507 if (copy < 0)
4508 return copy;
4509
4510 unit_serialize_item_format(u, f, "netns-socket-1", "%i", copy);
4511 }
4512
4513 return 0;
4514 }
4515
4516 int exec_runtime_deserialize_item(Unit *u, ExecRuntime **rt, const char *key, const char *value, FDSet *fds) {
4517 int r;
4518
4519 assert(rt);
4520 assert(key);
4521 assert(value);
4522
4523 if (streq(key, "tmp-dir")) {
4524 char *copy;
4525
4526 r = exec_runtime_allocate(rt);
4527 if (r < 0)
4528 return log_oom();
4529
4530 copy = strdup(value);
4531 if (!copy)
4532 return log_oom();
4533
4534 free((*rt)->tmp_dir);
4535 (*rt)->tmp_dir = copy;
4536
4537 } else if (streq(key, "var-tmp-dir")) {
4538 char *copy;
4539
4540 r = exec_runtime_allocate(rt);
4541 if (r < 0)
4542 return log_oom();
4543
4544 copy = strdup(value);
4545 if (!copy)
4546 return log_oom();
4547
4548 free((*rt)->var_tmp_dir);
4549 (*rt)->var_tmp_dir = copy;
4550
4551 } else if (streq(key, "netns-socket-0")) {
4552 int fd;
4553
4554 r = exec_runtime_allocate(rt);
4555 if (r < 0)
4556 return log_oom();
4557
4558 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4559 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4560 else {
4561 safe_close((*rt)->netns_storage_socket[0]);
4562 (*rt)->netns_storage_socket[0] = fdset_remove(fds, fd);
4563 }
4564 } else if (streq(key, "netns-socket-1")) {
4565 int fd;
4566
4567 r = exec_runtime_allocate(rt);
4568 if (r < 0)
4569 return log_oom();
4570
4571 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4572 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4573 else {
4574 safe_close((*rt)->netns_storage_socket[1]);
4575 (*rt)->netns_storage_socket[1] = fdset_remove(fds, fd);
4576 }
4577 } else
4578 return 0;
4579
4580 return 1;
4581 }
4582
4583 static void *remove_tmpdir_thread(void *p) {
4584 _cleanup_free_ char *path = p;
4585
4586 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4587 return NULL;
4588 }
4589
4590 void exec_runtime_destroy(ExecRuntime *rt) {
4591 int r;
4592
4593 if (!rt)
4594 return;
4595
4596 /* If there are multiple users of this, let's leave the stuff around */
4597 if (rt->n_ref > 1)
4598 return;
4599
4600 if (rt->tmp_dir) {
4601 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4602
4603 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4604 if (r < 0) {
4605 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4606 free(rt->tmp_dir);
4607 }
4608
4609 rt->tmp_dir = NULL;
4610 }
4611
4612 if (rt->var_tmp_dir) {
4613 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4614
4615 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4616 if (r < 0) {
4617 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4618 free(rt->var_tmp_dir);
4619 }
4620
4621 rt->var_tmp_dir = NULL;
4622 }
4623
4624 safe_close_pair(rt->netns_storage_socket);
4625 }
4626
4627 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
4628 [EXEC_INPUT_NULL] = "null",
4629 [EXEC_INPUT_TTY] = "tty",
4630 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4631 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
4632 [EXEC_INPUT_SOCKET] = "socket",
4633 [EXEC_INPUT_NAMED_FD] = "fd",
4634 };
4635
4636 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
4637
4638 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
4639 [EXEC_OUTPUT_INHERIT] = "inherit",
4640 [EXEC_OUTPUT_NULL] = "null",
4641 [EXEC_OUTPUT_TTY] = "tty",
4642 [EXEC_OUTPUT_SYSLOG] = "syslog",
4643 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
4644 [EXEC_OUTPUT_KMSG] = "kmsg",
4645 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
4646 [EXEC_OUTPUT_JOURNAL] = "journal",
4647 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
4648 [EXEC_OUTPUT_SOCKET] = "socket",
4649 [EXEC_OUTPUT_NAMED_FD] = "fd",
4650 };
4651
4652 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
4653
4654 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
4655 [EXEC_UTMP_INIT] = "init",
4656 [EXEC_UTMP_LOGIN] = "login",
4657 [EXEC_UTMP_USER] = "user",
4658 };
4659
4660 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
4661
4662 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
4663 [EXEC_PRESERVE_NO] = "no",
4664 [EXEC_PRESERVE_YES] = "yes",
4665 [EXEC_PRESERVE_RESTART] = "restart",
4666 };
4667
4668 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
4669
4670 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
4671 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
4672 [EXEC_DIRECTORY_STATE] = "StateDirectory",
4673 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
4674 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
4675 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
4676 };
4677
4678 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
4679
4680 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
4681 [EXEC_KEYRING_INHERIT] = "inherit",
4682 [EXEC_KEYRING_PRIVATE] = "private",
4683 [EXEC_KEYRING_SHARED] = "shared",
4684 };
4685
4686 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);