]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
execute: some extra asserts
[thirdparty/systemd.git] / src / core / execute.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2010 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <errno.h>
21 #include <fcntl.h>
22 #include <glob.h>
23 #include <grp.h>
24 #include <poll.h>
25 #include <signal.h>
26 #include <string.h>
27 #include <sys/capability.h>
28 #include <sys/eventfd.h>
29 #include <sys/mman.h>
30 #include <sys/personality.h>
31 #include <sys/prctl.h>
32 #include <sys/shm.h>
33 #include <sys/socket.h>
34 #include <sys/stat.h>
35 #include <sys/types.h>
36 #include <sys/un.h>
37 #include <unistd.h>
38 #include <utmpx.h>
39
40 #if HAVE_PAM
41 #include <security/pam_appl.h>
42 #endif
43
44 #if HAVE_SELINUX
45 #include <selinux/selinux.h>
46 #endif
47
48 #if HAVE_SECCOMP
49 #include <seccomp.h>
50 #endif
51
52 #if HAVE_APPARMOR
53 #include <sys/apparmor.h>
54 #endif
55
56 #include "sd-messages.h"
57
58 #include "af-list.h"
59 #include "alloc-util.h"
60 #if HAVE_APPARMOR
61 #include "apparmor-util.h"
62 #endif
63 #include "async.h"
64 #include "barrier.h"
65 #include "cap-list.h"
66 #include "capability-util.h"
67 #include "chown-recursive.h"
68 #include "def.h"
69 #include "env-util.h"
70 #include "errno-list.h"
71 #include "execute.h"
72 #include "exit-status.h"
73 #include "fd-util.h"
74 #include "fileio.h"
75 #include "format-util.h"
76 #include "fs-util.h"
77 #include "glob-util.h"
78 #include "io-util.h"
79 #include "ioprio.h"
80 #include "label.h"
81 #include "log.h"
82 #include "macro.h"
83 #include "missing.h"
84 #include "mkdir.h"
85 #include "namespace.h"
86 #include "parse-util.h"
87 #include "path-util.h"
88 #include "process-util.h"
89 #include "rlimit-util.h"
90 #include "rm-rf.h"
91 #if HAVE_SECCOMP
92 #include "seccomp-util.h"
93 #endif
94 #include "securebits.h"
95 #include "securebits-util.h"
96 #include "selinux-util.h"
97 #include "signal-util.h"
98 #include "smack-util.h"
99 #include "special.h"
100 #include "string-table.h"
101 #include "string-util.h"
102 #include "strv.h"
103 #include "syslog-util.h"
104 #include "terminal-util.h"
105 #include "unit.h"
106 #include "user-util.h"
107 #include "util.h"
108 #include "utmp-wtmp.h"
109
/* Two timeouts, 5x and 1x USEC_PER_SEC (presumably microseconds, going by the naming). Their users are not
 * within this part of the file. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Access mode enforced on terminals by chown_terminal(). This assumes there is a 'tty' group */
#define TTY_MODE 0620

/* Send buffer size (8 MiB) requested via fd_inc_sndbuf() for the journal stream socket in connect_logger_as() */
#define SNDBUF_SIZE (8*1024*1024)
117
/* Rearranges the passed file descriptors so that fds[k] ends up as fd number k+3 for every k. The array is
 * updated in place to reflect the new fd numbers. Returns 0 on success, -errno if duplicating an fd fails. */
static int shift_fds(int fds[], unsigned n_fds) {
        int first = 0, retry_at;

        if (n_fds == 0)
                return 0;

        /* Modifies the fds array! */

        assert(fds);

        do {
                int idx;

                retry_at = -1;

                for (idx = first; idx < (int) n_fds; idx++) {
                        const int wanted = idx + 3;
                        int dup_fd;

                        /* Nothing to do if the fd already sits in its slot */
                        if (fds[idx] == wanted)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* The slot we wanted was still occupied and we got a higher fd instead? Then
                         * remember the first such index, so that the next pass restarts from there. */
                        if (retry_at < 0 && dup_fd != wanted)
                                retry_at = idx;
                }

                first = retry_at;
        } while (retry_at >= 0);

        return 0;
}
162
/* Adjusts the file descriptor flags of the fds we are about to pass on: FD_CLOEXEC is dropped from all of them,
 * while O_NONBLOCK is set or cleared (according to 'nonblock') only on the first n_socket_fds entries.
 * Returns 0 on success, or the negative error returned by fd_nonblock()/fd_cloexec(). */
static int flags_fds(const int fds[], unsigned n_storage_fds, unsigned n_socket_fds, bool nonblock) {
        const unsigned total = n_storage_fds + n_socket_fds;
        unsigned idx;

        if (total == 0)
                return 0;

        assert(fds);

        for (idx = 0; idx < total; idx++) {
                int r;

                /* O_NONBLOCK only matters for socket activation, hence only touch the leading entries */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is always turned off, after all these fds are supposed to reach our children */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
195
196 static const char *exec_context_tty_path(const ExecContext *context) {
197 assert(context);
198
199 if (context->stdio_as_fds)
200 return NULL;
201
202 if (context->tty_path)
203 return context->tty_path;
204
205 return "/dev/console";
206 }
207
208 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
209 const char *path;
210
211 assert(context);
212
213 path = exec_context_tty_path(context);
214
215 if (context->tty_vhangup) {
216 if (p && p->stdin_fd >= 0)
217 (void) terminal_vhangup_fd(p->stdin_fd);
218 else if (path)
219 (void) terminal_vhangup(path);
220 }
221
222 if (context->tty_reset) {
223 if (p && p->stdin_fd >= 0)
224 (void) reset_terminal_fd(p->stdin_fd, true);
225 else if (path)
226 (void) reset_terminal(path);
227 }
228
229 if (context->tty_vt_disallocate && path)
230 (void) vt_disallocate(path);
231 }
232
233 static bool is_terminal_input(ExecInput i) {
234 return IN_SET(i,
235 EXEC_INPUT_TTY,
236 EXEC_INPUT_TTY_FORCE,
237 EXEC_INPUT_TTY_FAIL);
238 }
239
240 static bool is_terminal_output(ExecOutput o) {
241 return IN_SET(o,
242 EXEC_OUTPUT_TTY,
243 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
244 EXEC_OUTPUT_KMSG_AND_CONSOLE,
245 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
246 }
247
248 static bool is_syslog_output(ExecOutput o) {
249 return IN_SET(o,
250 EXEC_OUTPUT_SYSLOG,
251 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
252 }
253
254 static bool is_kmsg_output(ExecOutput o) {
255 return IN_SET(o,
256 EXEC_OUTPUT_KMSG,
257 EXEC_OUTPUT_KMSG_AND_CONSOLE);
258 }
259
260 static bool exec_context_needs_term(const ExecContext *c) {
261 assert(c);
262
263 /* Return true if the execution context suggests we should set $TERM to something useful. */
264
265 if (is_terminal_input(c->std_input))
266 return true;
267
268 if (is_terminal_output(c->std_output))
269 return true;
270
271 if (is_terminal_output(c->std_error))
272 return true;
273
274 return !!c->tty_path;
275 }
276
/* Opens /dev/null with the specified flags (plus O_NOCTTY) and moves the result to fd number nfd via
 * move_fd(). Returns -errno if opening fails, otherwise whatever move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
288
289 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
290 static const union sockaddr_union sa = {
291 .un.sun_family = AF_UNIX,
292 .un.sun_path = "/run/systemd/journal/stdout",
293 };
294 uid_t olduid = UID_INVALID;
295 gid_t oldgid = GID_INVALID;
296 int r;
297
298 if (gid_is_valid(gid)) {
299 oldgid = getgid();
300
301 if (setegid(gid) < 0)
302 return -errno;
303 }
304
305 if (uid_is_valid(uid)) {
306 olduid = getuid();
307
308 if (seteuid(uid) < 0) {
309 r = -errno;
310 goto restore_gid;
311 }
312 }
313
314 r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
315
316 /* If we fail to restore the uid or gid, things will likely
317 fail later on. This should only happen if an LSM interferes. */
318
319 if (uid_is_valid(uid))
320 (void) seteuid(olduid);
321
322 restore_gid:
323 if (gid_is_valid(gid))
324 (void) setegid(oldgid);
325
326 return r;
327 }
328
329 static int connect_logger_as(
330 Unit *unit,
331 const ExecContext *context,
332 const ExecParameters *params,
333 ExecOutput output,
334 const char *ident,
335 int nfd,
336 uid_t uid,
337 gid_t gid) {
338
339 int fd, r;
340
341 assert(context);
342 assert(params);
343 assert(output < _EXEC_OUTPUT_MAX);
344 assert(ident);
345 assert(nfd >= 0);
346
347 fd = socket(AF_UNIX, SOCK_STREAM, 0);
348 if (fd < 0)
349 return -errno;
350
351 r = connect_journal_socket(fd, uid, gid);
352 if (r < 0)
353 return r;
354
355 if (shutdown(fd, SHUT_RD) < 0) {
356 safe_close(fd);
357 return -errno;
358 }
359
360 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
361
362 dprintf(fd,
363 "%s\n"
364 "%s\n"
365 "%i\n"
366 "%i\n"
367 "%i\n"
368 "%i\n"
369 "%i\n",
370 context->syslog_identifier ?: ident,
371 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
372 context->syslog_priority,
373 !!context->syslog_level_prefix,
374 is_syslog_output(output),
375 is_kmsg_output(output),
376 is_terminal_output(output));
377
378 return move_fd(fd, nfd, false);
379 }
/* Opens the terminal at 'path' with the specified flags (plus O_NOCTTY) and moves it to fd number nfd.
 * Returns the error of open_terminal() on failure, otherwise whatever move_fd() returns. */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
392
393 static int fixup_input(
394 const ExecContext *context,
395 int socket_fd,
396 bool apply_tty_stdin) {
397
398 ExecInput std_input;
399
400 assert(context);
401
402 std_input = context->std_input;
403
404 if (is_terminal_input(std_input) && !apply_tty_stdin)
405 return EXEC_INPUT_NULL;
406
407 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
408 return EXEC_INPUT_NULL;
409
410 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
411 return EXEC_INPUT_NULL;
412
413 return std_input;
414 }
415
416 static int fixup_output(ExecOutput std_output, int socket_fd) {
417
418 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
419 return EXEC_OUTPUT_INHERIT;
420
421 return std_output;
422 }
423
424 static int setup_input(
425 const ExecContext *context,
426 const ExecParameters *params,
427 int socket_fd,
428 int named_iofds[3]) {
429
430 ExecInput i;
431
432 assert(context);
433 assert(params);
434
435 if (params->stdin_fd >= 0) {
436 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
437 return -errno;
438
439 /* Try to make this the controlling tty, if it is a tty, and reset it */
440 if (isatty(STDIN_FILENO)) {
441 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
442 (void) reset_terminal_fd(STDIN_FILENO, true);
443 }
444
445 return STDIN_FILENO;
446 }
447
448 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
449
450 switch (i) {
451
452 case EXEC_INPUT_NULL:
453 return open_null_as(O_RDONLY, STDIN_FILENO);
454
455 case EXEC_INPUT_TTY:
456 case EXEC_INPUT_TTY_FORCE:
457 case EXEC_INPUT_TTY_FAIL: {
458 int fd;
459
460 fd = acquire_terminal(exec_context_tty_path(context),
461 i == EXEC_INPUT_TTY_FAIL,
462 i == EXEC_INPUT_TTY_FORCE,
463 false,
464 USEC_INFINITY);
465 if (fd < 0)
466 return fd;
467
468 return move_fd(fd, STDIN_FILENO, false);
469 }
470
471 case EXEC_INPUT_SOCKET:
472 assert(socket_fd >= 0);
473
474 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
475
476 case EXEC_INPUT_NAMED_FD:
477 assert(named_iofds[STDIN_FILENO] >= 0);
478
479 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
480 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
481
482 case EXEC_INPUT_DATA: {
483 int fd;
484
485 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
486 if (fd < 0)
487 return fd;
488
489 return move_fd(fd, STDIN_FILENO, false);
490 }
491
492 default:
493 assert_not_reached("Unknown input type");
494 }
495 }
496
/* Sets up stdout or stderr (selected by 'fileno') for the process to be executed.
 *
 * An stdout/stderr fd passed in the parameters takes precedence. Otherwise the (fixed up) output type of the
 * context decides what the fd gets connected to. If the fd ends up connected to the journal via
 * connect_logger_as(), the device and inode numbers of the stream are stored in *journal_stream_dev and
 * *journal_stream_ino.
 *
 * Returns a negative errno-style value on failure. On success 'fileno' is returned on the paths handled
 * directly here, otherwise the result of the helper that installed the fd. */
static int setup_output(
                Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                int fileno,
                int socket_fd,
                int named_iofds[3],
                const char *ident,
                uid_t uid,
                gid_t gid,
                dev_t *journal_stream_dev,
                ino_t *journal_stream_ino) {

        ExecOutput o;
        ExecInput i;
        int r;

        assert(unit);
        assert(context);
        assert(params);
        assert(ident);
        assert(journal_stream_dev);
        assert(journal_stream_ino);

        /* Explicitly passed fds override whatever is configured in the context */
        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {

                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
                        return -errno;

                return STDOUT_FILENO;
        }

        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
                        return -errno;

                return STDERR_FILENO;
        }

        /* Figure out the effective input and stdout types, the same way setup_input() does for the input */
        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
        o = fixup_output(context->std_output, socket_fd);

        if (fileno == STDERR_FILENO) {
                ExecOutput e;
                e = fixup_output(context->std_error, socket_fd);

                /* This expects the input and output are already set up */

                /* Don't change the stderr file descriptor if we inherit all
                 * the way and are not on a tty */
                if (e == EXEC_OUTPUT_INHERIT &&
                    o == EXEC_OUTPUT_INHERIT &&
                    i == EXEC_INPUT_NULL &&
                    !is_terminal_input(context->std_input) &&
                    getppid () != 1)
                        return fileno;

                /* Duplicate from stdout if possible */
                if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
                        return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;

                /* From here on stderr is handled by the generic code below, according to its own type */
                o = e;

        } else if (o == EXEC_OUTPUT_INHERIT) {
                /* If input got downgraded, inherit the original value */
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

                /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
                if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
                if (getppid() != 1)
                        return fileno;

                /* We need to open /dev/null here anew, to get the right access mode. */
                return open_null_as(O_WRONLY, fileno);
        }

        switch (o) {

        case EXEC_OUTPUT_NULL:
                return open_null_as(O_WRONLY, fileno);

        case EXEC_OUTPUT_TTY:
                /* If stdin is a terminal already, share it instead of opening the terminal a second time */
                if (is_terminal_input(i))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* We don't reset the terminal if this is just about output */
                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

        case EXEC_OUTPUT_SYSLOG:
        case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
        case EXEC_OUTPUT_KMSG:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
                if (r < 0) {
                        /* Not being able to log is not fatal: warn and fall back to /dev/null */
                        log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
                        r = open_null_as(O_WRONLY, fileno);
                } else {
                        struct stat st;

                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
                         * services to detect whether they are connected to the journal or not.
                         *
                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
                         * about STDERR as that's usually the best way to do logging. */

                        if (fstat(fileno, &st) >= 0 &&
                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
                                *journal_stream_dev = st.st_dev;
                                *journal_stream_ino = st.st_ino;
                        }
                }
                return r;

        case EXEC_OUTPUT_SOCKET:
                assert(socket_fd >= 0);

                return dup2(socket_fd, fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_NAMED_FD:
                assert(named_iofds[fileno] >= 0);

                /* The named fd is switched to blocking mode before it is handed over; failure is ignored */
                (void) fd_nonblock(named_iofds[fileno], false);
                return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;

        default:
                assert_not_reached("Unknown error type");
        }
}
632
633 static int chown_terminal(int fd, uid_t uid) {
634 struct stat st;
635
636 assert(fd >= 0);
637
638 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
639 if (isatty(fd) < 1)
640 return 0;
641
642 /* This might fail. What matters are the results. */
643 (void) fchown(fd, uid, -1);
644 (void) fchmod(fd, TTY_MODE);
645
646 if (fstat(fd, &st) < 0)
647 return -errno;
648
649 if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
650 return -EPERM;
651
652 return 0;
653 }
654
655 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
656 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
657 int r;
658
659 assert(_saved_stdin);
660 assert(_saved_stdout);
661
662 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
663 if (saved_stdin < 0)
664 return -errno;
665
666 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
667 if (saved_stdout < 0)
668 return -errno;
669
670 fd = acquire_terminal(vc, false, false, false, DEFAULT_CONFIRM_USEC);
671 if (fd < 0)
672 return fd;
673
674 r = chown_terminal(fd, getuid());
675 if (r < 0)
676 return r;
677
678 r = reset_terminal_fd(fd, true);
679 if (r < 0)
680 return r;
681
682 if (dup2(fd, STDIN_FILENO) < 0)
683 return -errno;
684
685 if (dup2(fd, STDOUT_FILENO) < 0)
686 return -errno;
687
688 if (fd >= 2)
689 safe_close(fd);
690 fd = -1;
691
692 *_saved_stdin = saved_stdin;
693 *_saved_stdout = saved_stdout;
694
695 saved_stdin = saved_stdout = -1;
696
697 return 0;
698 }
699
700 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
701 assert(err < 0);
702
703 if (err == -ETIMEDOUT)
704 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
705 else {
706 errno = -err;
707 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
708 }
709 }
710
711 static void write_confirm_error(int err, const char *vc, const Unit *u) {
712 _cleanup_close_ int fd = -1;
713
714 assert(vc);
715
716 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
717 if (fd < 0)
718 return;
719
720 write_confirm_error_fd(err, fd, u);
721 }
722
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved fds back in place as stdin/stdout and
 * closes the saved copies. Returns 0 on success, or -errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        /* The copies are closed in any case, and the caller's variables are reset */
        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
744
/* The possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* 'f': don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS = 0,  /* 's': don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE = 1,          /* 'y': execute the command; also used on internal errors */
};
750
751 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
752 int saved_stdout = -1, saved_stdin = -1, r;
753 _cleanup_free_ char *e = NULL;
754 char c;
755
756 /* For any internal errors, assume a positive response. */
757 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
758 if (r < 0) {
759 write_confirm_error(r, vc, u);
760 return CONFIRM_EXECUTE;
761 }
762
763 /* confirm_spawn might have been disabled while we were sleeping. */
764 if (manager_is_confirm_spawn_disabled(u->manager)) {
765 r = 1;
766 goto restore_stdio;
767 }
768
769 e = ellipsize(cmdline, 60, 100);
770 if (!e) {
771 log_oom();
772 r = CONFIRM_EXECUTE;
773 goto restore_stdio;
774 }
775
776 for (;;) {
777 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
778 if (r < 0) {
779 write_confirm_error_fd(r, STDOUT_FILENO, u);
780 r = CONFIRM_EXECUTE;
781 goto restore_stdio;
782 }
783
784 switch (c) {
785 case 'c':
786 printf("Resuming normal execution.\n");
787 manager_disable_confirm_spawn();
788 r = 1;
789 break;
790 case 'D':
791 unit_dump(u, stdout, " ");
792 continue; /* ask again */
793 case 'f':
794 printf("Failing execution.\n");
795 r = CONFIRM_PRETEND_FAILURE;
796 break;
797 case 'h':
798 printf(" c - continue, proceed without asking anymore\n"
799 " D - dump, show the state of the unit\n"
800 " f - fail, don't execute the command and pretend it failed\n"
801 " h - help\n"
802 " i - info, show a short summary of the unit\n"
803 " j - jobs, show jobs that are in progress\n"
804 " s - skip, don't execute the command and pretend it succeeded\n"
805 " y - yes, execute the command\n");
806 continue; /* ask again */
807 case 'i':
808 printf(" Description: %s\n"
809 " Unit: %s\n"
810 " Command: %s\n",
811 u->id, u->description, cmdline);
812 continue; /* ask again */
813 case 'j':
814 manager_dump_jobs(u->manager, stdout, " ");
815 continue; /* ask again */
816 case 'n':
817 /* 'n' was removed in favor of 'f'. */
818 printf("Didn't understand 'n', did you mean 'f'?\n");
819 continue; /* ask again */
820 case 's':
821 printf("Skipping execution.\n");
822 r = CONFIRM_PRETEND_SUCCESS;
823 break;
824 case 'y':
825 r = CONFIRM_EXECUTE;
826 break;
827 default:
828 assert_not_reached("Unhandled choice");
829 }
830 break;
831 }
832
833 restore_stdio:
834 restore_confirm_stdio(&saved_stdin, &saved_stdout);
835 return r;
836 }
837
838 static int get_fixed_user(const ExecContext *c, const char **user,
839 uid_t *uid, gid_t *gid,
840 const char **home, const char **shell) {
841 int r;
842 const char *name;
843
844 assert(c);
845
846 if (!c->user)
847 return 0;
848
849 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
850 * (i.e. are "/" or "/bin/nologin"). */
851
852 name = c->user;
853 r = get_user_creds_clean(&name, uid, gid, home, shell);
854 if (r < 0)
855 return r;
856
857 *user = name;
858 return 0;
859 }
860
861 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
862 int r;
863 const char *name;
864
865 assert(c);
866
867 if (!c->group)
868 return 0;
869
870 name = c->group;
871 r = get_group_creds(&name, gid);
872 if (r < 0)
873 return r;
874
875 *group = name;
876 return 0;
877 }
878
879 static int get_supplementary_groups(const ExecContext *c, const char *user,
880 const char *group, gid_t gid,
881 gid_t **supplementary_gids, int *ngids) {
882 char **i;
883 int r, k = 0;
884 int ngroups_max;
885 bool keep_groups = false;
886 gid_t *groups = NULL;
887 _cleanup_free_ gid_t *l_gids = NULL;
888
889 assert(c);
890
891 /*
892 * If user is given, then lookup GID and supplementary groups list.
893 * We avoid NSS lookups for gid=0. Also we have to initialize groups
894 * here and as early as possible so we keep the list of supplementary
895 * groups of the caller.
896 */
897 if (user && gid_is_valid(gid) && gid != 0) {
898 /* First step, initialize groups from /etc/groups */
899 if (initgroups(user, gid) < 0)
900 return -errno;
901
902 keep_groups = true;
903 }
904
905 if (strv_isempty(c->supplementary_groups))
906 return 0;
907
908 /*
909 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
910 * be positive, otherwise fail.
911 */
912 errno = 0;
913 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
914 if (ngroups_max <= 0) {
915 if (errno > 0)
916 return -errno;
917 else
918 return -EOPNOTSUPP; /* For all other values */
919 }
920
921 l_gids = new(gid_t, ngroups_max);
922 if (!l_gids)
923 return -ENOMEM;
924
925 if (keep_groups) {
926 /*
927 * Lookup the list of groups that the user belongs to, we
928 * avoid NSS lookups here too for gid=0.
929 */
930 k = ngroups_max;
931 if (getgrouplist(user, gid, l_gids, &k) < 0)
932 return -EINVAL;
933 } else
934 k = 0;
935
936 STRV_FOREACH(i, c->supplementary_groups) {
937 const char *g;
938
939 if (k >= ngroups_max)
940 return -E2BIG;
941
942 g = *i;
943 r = get_group_creds(&g, l_gids+k);
944 if (r < 0)
945 return r;
946
947 k++;
948 }
949
950 /*
951 * Sets ngids to zero to drop all supplementary groups, happens
952 * when we are under root and SupplementaryGroups= is empty.
953 */
954 if (k == 0) {
955 *ngids = 0;
956 return 0;
957 }
958
959 /* Otherwise get the final list of supplementary groups */
960 groups = memdup(l_gids, sizeof(gid_t) * k);
961 if (!groups)
962 return -ENOMEM;
963
964 *supplementary_gids = groups;
965 *ngids = k;
966
967 groups = NULL;
968
969 return 0;
970 }
971
/* Applies the supplementary groups (if any were determined) and then the real, effective and saved GID (if
 * the GID is valid). Returns 0 on success, a negative errno-style value on failure. */
static int enforce_groups(gid_t gid, gid_t *supplementary_gids, int ngids) {

        /* The supplementary groups are only touched if the list is not empty */
        if (ngids > 0) {
                int r;

                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
990
991 static int enforce_user(const ExecContext *context, uid_t uid) {
992 assert(context);
993
994 if (!uid_is_valid(uid))
995 return 0;
996
997 /* Sets (but doesn't look up) the uid and make sure we keep the
998 * capabilities while doing so. */
999
1000 if (context->capability_ambient_set != 0) {
1001
1002 /* First step: If we need to keep capabilities but
1003 * drop privileges we need to make sure we keep our
1004 * caps, while we drop privileges. */
1005 if (uid != 0) {
1006 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1007
1008 if (prctl(PR_GET_SECUREBITS) != sb)
1009 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1010 return -errno;
1011 }
1012 }
1013
1014 /* Second step: actually set the uids */
1015 if (setresuid(uid, uid, uid) < 0)
1016 return -errno;
1017
1018 /* At this point we should have all necessary capabilities but
1019 are otherwise a normal user. However, the caps might got
1020 corrupted due to the setresuid() so we need clean them up
1021 later. This is done outside of this call. */
1022
1023 return 0;
1024 }
1025
1026 #if HAVE_PAM
1027
/* PAM conversation callback installed by setup_pam(). All arguments are ignored and PAM_CONV_ERR is returned
 * unconditionally. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        /* We don't support conversations */

        return PAM_CONV_ERR;
}
1038
1039 #endif
1040
/* Opens a PAM session for service 'name' and user 'user', and forks off a helper process that closes the
 * session again once the calling process is gone.
 *
 * 'tty' (optional) is passed to PAM as PAM_TTY. The environment in *env is exported to PAM before the session
 * is opened; on success *env is freed and replaced by the environment list PAM returns. 'fds' are the file
 * descriptors the helper process closes right after the fork. 'uid' and 'gid' are the credentials the helper
 * process drops privileges to.
 *
 * Returns 0 on success, and a negative errno-style value on failure (-EPERM for errors reported by PAM).
 * If built without PAM support this does nothing and returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], unsigned n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we log at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Make the environment we have so far known to PAM */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From now on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        /* Remember our PID, so that the child can detect whether it got reparented */
        parent_pid = getpid_cached();

        pam_pid = fork();
        if (pam_pid < 0) {
                r = -errno;
                goto fail;
        }

        if (pam_pid == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* This string must fit in 10 chars (i.e. the length
                 * of "/sbin/init"), to look pretty in /bin/ps */
                rename_process("(sd-pam)");

                /* Make sure we don't keep open the passed fds in this
                child. We assume that otherwise only those fds are
                open here that have been opened by PAM. */
                close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* Sleep until the SIGTERM we blocked before the fork arrives */
                        for (;;) {
                                if (sigwait(&ss, &sig) < 0) {
                                        if (errno == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Replace the caller's environment by the one PAM handed us */
        strv_free(*env);
        *env = e;

        return 0;

fail:
        /* We get here both for PAM errors (pam_code set) and for errno-style errors (r set) */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1257
/* Renames the calling process to the file name of 'path', enclosed in parentheses. Only the last 8 characters
 * of the file name are used. If the file name is empty the process is renamed to "(...)". */
static void rename_process_from_path(const char *path) {
        char buf[11];
        const char *base;
        size_t len;

        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
         * /bin/ps */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        len = strlen(base);
        if (len > 8) {
                /* Prefer the tail of the name, it usually is more interesting, since the beginning
                 * might just be "systemd-" */
                base += len - 8;
                len = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, base, len);
        buf[len + 1] = ')';
        buf[len + 2] = 0;

        rename_process(buf);
}
1288
1289 static bool context_has_address_families(const ExecContext *c) {
1290 assert(c);
1291
1292 return c->address_families_whitelist ||
1293 !set_isempty(c->address_families);
1294 }
1295
1296 static bool context_has_syscall_filters(const ExecContext *c) {
1297 assert(c);
1298
1299 return c->syscall_whitelist ||
1300 !hashmap_isempty(c->syscall_filter);
1301 }
1302
/* Decides whether "no new privileges" (NNP) is wanted for this context: true if it is requested
 * explicitly, or if we lack CAP_SYS_ADMIN and any of the seccomp-backed settings is in use. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        /* Explicitly requested */
        if (c->no_new_privileges)
                return true;

        /* If we are privileged, we don't need NNP */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* We are unprivileged: any form of seccomp usage requires NNP. */
        if (context_has_address_families(c) ||
            c->memory_deny_write_execute ||
            c->restrict_realtime ||
            exec_context_restrict_namespaces_set(c))
                return true;

        if (c->protect_kernel_tunables ||
            c->protect_kernel_modules ||
            c->private_devices)
                return true;

        if (context_has_syscall_filters(c) ||
            !set_isempty(c->syscall_archs))
                return true;

        return c->lock_personality;
}
1324
1325 #if HAVE_SECCOMP
1326
1327 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1328
1329 if (is_seccomp_available())
1330 return false;
1331
1332 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1333 return true;
1334 }
1335
1336 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1337 uint32_t negative_action, default_action, action;
1338 int r;
1339
1340 assert(u);
1341 assert(c);
1342
1343 if (!context_has_syscall_filters(c))
1344 return 0;
1345
1346 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1347 return 0;
1348
1349 negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1350
1351 if (c->syscall_whitelist) {
1352 default_action = negative_action;
1353 action = SCMP_ACT_ALLOW;
1354 } else {
1355 default_action = SCMP_ACT_ALLOW;
1356 action = negative_action;
1357 }
1358
1359 if (needs_ambient_hack) {
1360 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1361 if (r < 0)
1362 return r;
1363 }
1364
1365 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1366 }
1367
1368 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1369 assert(u);
1370 assert(c);
1371
1372 if (set_isempty(c->syscall_archs))
1373 return 0;
1374
1375 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1376 return 0;
1377
1378 return seccomp_restrict_archs(c->syscall_archs);
1379 }
1380
1381 static int apply_address_families(const Unit* u, const ExecContext *c) {
1382 assert(u);
1383 assert(c);
1384
1385 if (!context_has_address_families(c))
1386 return 0;
1387
1388 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1389 return 0;
1390
1391 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1392 }
1393
1394 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1395 assert(u);
1396 assert(c);
1397
1398 if (!c->memory_deny_write_execute)
1399 return 0;
1400
1401 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1402 return 0;
1403
1404 return seccomp_memory_deny_write_execute();
1405 }
1406
1407 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1408 assert(u);
1409 assert(c);
1410
1411 if (!c->restrict_realtime)
1412 return 0;
1413
1414 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1415 return 0;
1416
1417 return seccomp_restrict_realtime();
1418 }
1419
1420 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1421 assert(u);
1422 assert(c);
1423
1424 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1425 * let's protect even those systems where this is left on in the kernel. */
1426
1427 if (!c->protect_kernel_tunables)
1428 return 0;
1429
1430 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1431 return 0;
1432
1433 return seccomp_protect_sysctl();
1434 }
1435
1436 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1437 assert(u);
1438 assert(c);
1439
1440 /* Turn off module syscalls on ProtectKernelModules=yes */
1441
1442 if (!c->protect_kernel_modules)
1443 return 0;
1444
1445 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1446 return 0;
1447
1448 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1449 }
1450
1451 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1452 assert(u);
1453 assert(c);
1454
1455 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1456
1457 if (!c->private_devices)
1458 return 0;
1459
1460 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1461 return 0;
1462
1463 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1464 }
1465
1466 static int apply_restrict_namespaces(Unit *u, const ExecContext *c) {
1467 assert(u);
1468 assert(c);
1469
1470 if (!exec_context_restrict_namespaces_set(c))
1471 return 0;
1472
1473 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1474 return 0;
1475
1476 return seccomp_restrict_namespaces(c->restrict_namespaces);
1477 }
1478
1479 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1480 unsigned long personality;
1481 int r;
1482
1483 assert(u);
1484 assert(c);
1485
1486 if (!c->lock_personality)
1487 return 0;
1488
1489 if (skip_seccomp_unavailable(u, "LockPersonality="))
1490 return 0;
1491
1492 personality = c->personality;
1493
1494 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1495 if (personality == PERSONALITY_INVALID) {
1496
1497 r = opinionated_personality(&personality);
1498 if (r < 0)
1499 return r;
1500 }
1501
1502 return seccomp_lock_personality(personality);
1503 }
1504
1505 #endif
1506
1507 static void do_idle_pipe_dance(int idle_pipe[4]) {
1508 assert(idle_pipe);
1509
1510 idle_pipe[1] = safe_close(idle_pipe[1]);
1511 idle_pipe[2] = safe_close(idle_pipe[2]);
1512
1513 if (idle_pipe[0] >= 0) {
1514 int r;
1515
1516 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1517
1518 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1519 ssize_t n;
1520
1521 /* Signal systemd that we are bored and want to continue. */
1522 n = write(idle_pipe[3], "x", 1);
1523 if (n > 0)
1524 /* Wait for systemd to react to the signal above. */
1525 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1526 }
1527
1528 idle_pipe[0] = safe_close(idle_pipe[0]);
1529
1530 }
1531
1532 idle_pipe[3] = safe_close(idle_pipe[3]);
1533 }
1534
1535 static int build_environment(
1536 Unit *u,
1537 const ExecContext *c,
1538 const ExecParameters *p,
1539 unsigned n_fds,
1540 const char *home,
1541 const char *username,
1542 const char *shell,
1543 dev_t journal_stream_dev,
1544 ino_t journal_stream_ino,
1545 char ***ret) {
1546
1547 _cleanup_strv_free_ char **our_env = NULL;
1548 unsigned n_env = 0;
1549 char *x;
1550
1551 assert(u);
1552 assert(c);
1553 assert(ret);
1554
1555 our_env = new0(char*, 14);
1556 if (!our_env)
1557 return -ENOMEM;
1558
1559 if (n_fds > 0) {
1560 _cleanup_free_ char *joined = NULL;
1561
1562 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1563 return -ENOMEM;
1564 our_env[n_env++] = x;
1565
1566 if (asprintf(&x, "LISTEN_FDS=%u", n_fds) < 0)
1567 return -ENOMEM;
1568 our_env[n_env++] = x;
1569
1570 joined = strv_join(p->fd_names, ":");
1571 if (!joined)
1572 return -ENOMEM;
1573
1574 x = strjoin("LISTEN_FDNAMES=", joined);
1575 if (!x)
1576 return -ENOMEM;
1577 our_env[n_env++] = x;
1578 }
1579
1580 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1581 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1582 return -ENOMEM;
1583 our_env[n_env++] = x;
1584
1585 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1586 return -ENOMEM;
1587 our_env[n_env++] = x;
1588 }
1589
1590 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1591 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1592 * check the database directly. */
1593 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1594 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1595 if (!x)
1596 return -ENOMEM;
1597 our_env[n_env++] = x;
1598 }
1599
1600 if (home) {
1601 x = strappend("HOME=", home);
1602 if (!x)
1603 return -ENOMEM;
1604 our_env[n_env++] = x;
1605 }
1606
1607 if (username) {
1608 x = strappend("LOGNAME=", username);
1609 if (!x)
1610 return -ENOMEM;
1611 our_env[n_env++] = x;
1612
1613 x = strappend("USER=", username);
1614 if (!x)
1615 return -ENOMEM;
1616 our_env[n_env++] = x;
1617 }
1618
1619 if (shell) {
1620 x = strappend("SHELL=", shell);
1621 if (!x)
1622 return -ENOMEM;
1623 our_env[n_env++] = x;
1624 }
1625
1626 if (!sd_id128_is_null(u->invocation_id)) {
1627 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1628 return -ENOMEM;
1629
1630 our_env[n_env++] = x;
1631 }
1632
1633 if (exec_context_needs_term(c)) {
1634 const char *tty_path, *term = NULL;
1635
1636 tty_path = exec_context_tty_path(c);
1637
1638 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1639 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1640 * passes to PID 1 ends up all the way in the console login shown. */
1641
1642 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1643 term = getenv("TERM");
1644 if (!term)
1645 term = default_term_for_tty(tty_path);
1646
1647 x = strappend("TERM=", term);
1648 if (!x)
1649 return -ENOMEM;
1650 our_env[n_env++] = x;
1651 }
1652
1653 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1654 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1655 return -ENOMEM;
1656
1657 our_env[n_env++] = x;
1658 }
1659
1660 our_env[n_env++] = NULL;
1661 assert(n_env <= 12);
1662
1663 *ret = our_env;
1664 our_env = NULL;
1665
1666 return 0;
1667 }
1668
1669 static int build_pass_environment(const ExecContext *c, char ***ret) {
1670 _cleanup_strv_free_ char **pass_env = NULL;
1671 size_t n_env = 0, n_bufsize = 0;
1672 char **i;
1673
1674 STRV_FOREACH(i, c->pass_environment) {
1675 _cleanup_free_ char *x = NULL;
1676 char *v;
1677
1678 v = getenv(*i);
1679 if (!v)
1680 continue;
1681 x = strjoin(*i, "=", v);
1682 if (!x)
1683 return -ENOMEM;
1684
1685 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1686 return -ENOMEM;
1687
1688 pass_env[n_env++] = x;
1689 pass_env[n_env] = NULL;
1690 x = NULL;
1691 }
1692
1693 *ret = pass_env;
1694 pass_env = NULL;
1695
1696 return 0;
1697 }
1698
1699 static bool exec_needs_mount_namespace(
1700 const ExecContext *context,
1701 const ExecParameters *params,
1702 ExecRuntime *runtime) {
1703
1704 assert(context);
1705 assert(params);
1706
1707 if (context->root_image)
1708 return true;
1709
1710 if (!strv_isempty(context->read_write_paths) ||
1711 !strv_isempty(context->read_only_paths) ||
1712 !strv_isempty(context->inaccessible_paths))
1713 return true;
1714
1715 if (context->n_bind_mounts > 0 ||
1716 !strv_isempty(context->directories[EXEC_DIRECTORY_RUNTIME].paths) ||
1717 !strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1718 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1719 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths) ||
1720 !strv_isempty(context->directories[EXEC_DIRECTORY_CONFIGURATION].paths))
1721 return true;
1722
1723 if (context->mount_flags != 0)
1724 return true;
1725
1726 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1727 return true;
1728
1729 if (context->private_devices ||
1730 context->protect_system != PROTECT_SYSTEM_NO ||
1731 context->protect_home != PROTECT_HOME_NO ||
1732 context->protect_kernel_tunables ||
1733 context->protect_kernel_modules ||
1734 context->protect_control_groups)
1735 return true;
1736
1737 if (context->mount_apivfs && (context->root_image || context->root_directory))
1738 return true;
1739
1740 return false;
1741 }
1742
1743 static int setup_private_users(uid_t uid, gid_t gid) {
1744 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1745 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1746 _cleanup_close_ int unshare_ready_fd = -1;
1747 _cleanup_(sigkill_waitp) pid_t pid = 0;
1748 uint64_t c = 1;
1749 siginfo_t si;
1750 ssize_t n;
1751 int r;
1752
1753 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1754 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1755 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1756 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1757 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1758 * continues execution normally. */
1759
1760 if (uid != 0 && uid_is_valid(uid)) {
1761 r = asprintf(&uid_map,
1762 "0 0 1\n" /* Map root → root */
1763 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
1764 uid, uid);
1765 if (r < 0)
1766 return -ENOMEM;
1767 } else {
1768 uid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1769 if (!uid_map)
1770 return -ENOMEM;
1771 }
1772
1773 if (gid != 0 && gid_is_valid(gid)) {
1774 r = asprintf(&gid_map,
1775 "0 0 1\n" /* Map root → root */
1776 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
1777 gid, gid);
1778 if (r < 0)
1779 return -ENOMEM;
1780 } else {
1781 gid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1782 if (!gid_map)
1783 return -ENOMEM;
1784 }
1785
1786 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1787 * namespace. */
1788 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1789 if (unshare_ready_fd < 0)
1790 return -errno;
1791
1792 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1793 * failed. */
1794 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1795 return -errno;
1796
1797 pid = fork();
1798 if (pid < 0)
1799 return -errno;
1800
1801 if (pid == 0) {
1802 _cleanup_close_ int fd = -1;
1803 const char *a;
1804 pid_t ppid;
1805
1806 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1807 * here, after the parent opened its own user namespace. */
1808
1809 ppid = getppid();
1810 errno_pipe[0] = safe_close(errno_pipe[0]);
1811
1812 /* Wait until the parent unshared the user namespace */
1813 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1814 r = -errno;
1815 goto child_fail;
1816 }
1817
1818 /* Disable the setgroups() system call in the child user namespace, for good. */
1819 a = procfs_file_alloca(ppid, "setgroups");
1820 fd = open(a, O_WRONLY|O_CLOEXEC);
1821 if (fd < 0) {
1822 if (errno != ENOENT) {
1823 r = -errno;
1824 goto child_fail;
1825 }
1826
1827 /* If the file is missing the kernel is too old, let's continue anyway. */
1828 } else {
1829 if (write(fd, "deny\n", 5) < 0) {
1830 r = -errno;
1831 goto child_fail;
1832 }
1833
1834 fd = safe_close(fd);
1835 }
1836
1837 /* First write the GID map */
1838 a = procfs_file_alloca(ppid, "gid_map");
1839 fd = open(a, O_WRONLY|O_CLOEXEC);
1840 if (fd < 0) {
1841 r = -errno;
1842 goto child_fail;
1843 }
1844 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1845 r = -errno;
1846 goto child_fail;
1847 }
1848 fd = safe_close(fd);
1849
1850 /* The write the UID map */
1851 a = procfs_file_alloca(ppid, "uid_map");
1852 fd = open(a, O_WRONLY|O_CLOEXEC);
1853 if (fd < 0) {
1854 r = -errno;
1855 goto child_fail;
1856 }
1857 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1858 r = -errno;
1859 goto child_fail;
1860 }
1861
1862 _exit(EXIT_SUCCESS);
1863
1864 child_fail:
1865 (void) write(errno_pipe[1], &r, sizeof(r));
1866 _exit(EXIT_FAILURE);
1867 }
1868
1869 errno_pipe[1] = safe_close(errno_pipe[1]);
1870
1871 if (unshare(CLONE_NEWUSER) < 0)
1872 return -errno;
1873
1874 /* Let the child know that the namespace is ready now */
1875 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1876 return -errno;
1877
1878 /* Try to read an error code from the child */
1879 n = read(errno_pipe[0], &r, sizeof(r));
1880 if (n < 0)
1881 return -errno;
1882 if (n == sizeof(r)) { /* an error code was sent to us */
1883 if (r < 0)
1884 return r;
1885 return -EIO;
1886 }
1887 if (n != 0) /* on success we should have read 0 bytes */
1888 return -EIO;
1889
1890 r = wait_for_terminate(pid, &si);
1891 if (r < 0)
1892 return r;
1893 pid = 0;
1894
1895 /* If something strange happened with the child, let's consider this fatal, too */
1896 if (si.si_code != CLD_EXITED || si.si_status != 0)
1897 return -EIO;
1898
1899 return 0;
1900 }
1901
1902 static int setup_exec_directory(
1903 const ExecContext *context,
1904 const ExecParameters *params,
1905 uid_t uid,
1906 gid_t gid,
1907 ExecDirectoryType type,
1908 int *exit_status) {
1909
1910 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1911 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1912 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1913 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1914 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
1915 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
1916 };
1917 char **rt;
1918 int r;
1919
1920 assert(context);
1921 assert(params);
1922 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
1923 assert(exit_status);
1924
1925 if (!params->prefix[type])
1926 return 0;
1927
1928 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
1929 if (!uid_is_valid(uid))
1930 uid = 0;
1931 if (!gid_is_valid(gid))
1932 gid = 0;
1933 }
1934
1935 STRV_FOREACH(rt, context->directories[type].paths) {
1936 _cleanup_free_ char *p = NULL, *pp = NULL;
1937 const char *effective;
1938
1939 p = strjoin(params->prefix[type], "/", *rt);
1940 if (!p) {
1941 r = -ENOMEM;
1942 goto fail;
1943 }
1944
1945 r = mkdir_parents_label(p, 0755);
1946 if (r < 0)
1947 goto fail;
1948
1949 if (context->dynamic_user &&
1950 !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
1951 _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
1952
1953 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
1954 * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
1955 * whose UID is later on reused. To lock this down we use the same trick used by container
1956 * managers to prohibit host users to get access to files of the same UID in containers: we
1957 * place everything inside a directory that has an access mode of 0700 and is owned root:root,
1958 * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
1959 * to make this directory permeable for the service itself.
1960 *
1961 * Specifically: for a service which wants a special directory "foo/" we first create a
1962 * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
1963 * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
1964 * privileged host users can access "foo/" as usual, but unprivileged host users can't look
1965 * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
1966 * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
1967 * disabling the access boundary for the service and making sure it only gets access to the
1968 * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
1969 *
1970 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
1971 * owned by the service itself.
1972 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
1973 * files or sockets with other services. */
1974
1975 private_root = strjoin(params->prefix[type], "/private");
1976 if (!private_root) {
1977 r = -ENOMEM;
1978 goto fail;
1979 }
1980
1981 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
1982 r = mkdir_safe_label(private_root, 0700, 0, 0, false);
1983 if (r < 0)
1984 goto fail;
1985
1986 pp = strjoin(private_root, "/", *rt);
1987 if (!pp) {
1988 r = -ENOMEM;
1989 goto fail;
1990 }
1991
1992 /* Create all directories between the configured directory and this private root, and mark them 0755 */
1993 r = mkdir_parents_label(pp, 0755);
1994 if (r < 0)
1995 goto fail;
1996
1997 /* Finally, create the actual directory for the service */
1998 r = mkdir_label(pp, context->directories[type].mode);
1999 if (r < 0 && r != -EEXIST)
2000 goto fail;
2001
2002 parent = dirname_malloc(p);
2003 if (!parent) {
2004 r = -ENOMEM;
2005 goto fail;
2006 }
2007
2008 r = path_make_relative(parent, pp, &relative);
2009 if (r < 0)
2010 goto fail;
2011
2012 /* And link it up from the original place */
2013 r = symlink_idempotent(relative, p);
2014 if (r < 0)
2015 goto fail;
2016
2017 effective = pp;
2018
2019 } else {
2020 r = mkdir_label(p, context->directories[type].mode);
2021 if (r < 0 && r != -EEXIST)
2022 goto fail;
2023
2024 effective = p;
2025 }
2026
2027 /* First lock down the access mode */
2028 if (chmod(effective, context->directories[type].mode) < 0) {
2029 r = -errno;
2030 goto fail;
2031 }
2032
2033 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2034 * a service, and shall not be writable. */
2035 if (type == EXEC_DIRECTORY_CONFIGURATION)
2036 continue;
2037
2038 /* Then, change the ownership of the whole tree, if necessary */
2039 r = path_chown_recursive(effective, uid, gid);
2040 if (r < 0)
2041 goto fail;
2042 }
2043
2044 return 0;
2045
2046 fail:
2047 *exit_status = exit_status_table[type];
2048 return r;
2049 }
2050
2051 static int setup_smack(
2052 const ExecContext *context,
2053 const ExecCommand *command) {
2054
2055 int r;
2056
2057 assert(context);
2058 assert(command);
2059
2060 if (context->smack_process_label) {
2061 r = mac_smack_apply_pid(0, context->smack_process_label);
2062 if (r < 0)
2063 return r;
2064 }
2065 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2066 else {
2067 _cleanup_free_ char *exec_label = NULL;
2068
2069 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2070 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2071 return r;
2072
2073 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2074 if (r < 0)
2075 return r;
2076 }
2077 #endif
2078
2079 return 0;
2080 }
2081
2082 static int compile_bind_mounts(
2083 const ExecContext *context,
2084 const ExecParameters *params,
2085 BindMount **ret_bind_mounts,
2086 unsigned *ret_n_bind_mounts,
2087 char ***ret_empty_directories) {
2088
2089 _cleanup_strv_free_ char **empty_directories = NULL;
2090 BindMount *bind_mounts;
2091 unsigned n, h = 0, i;
2092 ExecDirectoryType t;
2093 int r;
2094
2095 assert(context);
2096 assert(params);
2097 assert(ret_bind_mounts);
2098 assert(ret_n_bind_mounts);
2099 assert(ret_empty_directories);
2100
2101 n = context->n_bind_mounts;
2102 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2103 if (!params->prefix[t])
2104 continue;
2105
2106 n += strv_length(context->directories[t].paths);
2107 }
2108
2109 if (n <= 0) {
2110 *ret_bind_mounts = NULL;
2111 *ret_n_bind_mounts = 0;
2112 *ret_empty_directories = NULL;
2113 return 0;
2114 }
2115
2116 bind_mounts = new(BindMount, n);
2117 if (!bind_mounts)
2118 return -ENOMEM;
2119
2120 for (i = 0; i < context->n_bind_mounts; i++) {
2121 BindMount *item = context->bind_mounts + i;
2122 char *s, *d;
2123
2124 s = strdup(item->source);
2125 if (!s) {
2126 r = -ENOMEM;
2127 goto finish;
2128 }
2129
2130 d = strdup(item->destination);
2131 if (!d) {
2132 free(s);
2133 r = -ENOMEM;
2134 goto finish;
2135 }
2136
2137 bind_mounts[h++] = (BindMount) {
2138 .source = s,
2139 .destination = d,
2140 .read_only = item->read_only,
2141 .recursive = item->recursive,
2142 .ignore_enoent = item->ignore_enoent,
2143 };
2144 }
2145
2146 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2147 char **suffix;
2148
2149 if (!params->prefix[t])
2150 continue;
2151
2152 if (strv_isempty(context->directories[t].paths))
2153 continue;
2154
2155 if (context->dynamic_user &&
2156 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2157 char *private_root;
2158
2159 /* So this is for a dynamic user, and we need to make sure the process can access its own
2160 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2161 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2162
2163 private_root = strjoin(params->prefix[t], "/private");
2164 if (!private_root) {
2165 r = -ENOMEM;
2166 goto finish;
2167 }
2168
2169 r = strv_consume(&empty_directories, private_root);
2170 if (r < 0) {
2171 r = -ENOMEM;
2172 goto finish;
2173 }
2174 }
2175
2176 STRV_FOREACH(suffix, context->directories[t].paths) {
2177 char *s, *d;
2178
2179 if (context->dynamic_user &&
2180 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2181 s = strjoin(params->prefix[t], "/private/", *suffix);
2182 else
2183 s = strjoin(params->prefix[t], "/", *suffix);
2184 if (!s) {
2185 r = -ENOMEM;
2186 goto finish;
2187 }
2188
2189 d = strdup(s);
2190 if (!d) {
2191 free(s);
2192 r = -ENOMEM;
2193 goto finish;
2194 }
2195
2196 bind_mounts[h++] = (BindMount) {
2197 .source = s,
2198 .destination = d,
2199 .read_only = false,
2200 .recursive = true,
2201 .ignore_enoent = false,
2202 };
2203 }
2204 }
2205
2206 assert(h == n);
2207
2208 *ret_bind_mounts = bind_mounts;
2209 *ret_n_bind_mounts = n;
2210 *ret_empty_directories = empty_directories;
2211
2212 empty_directories = NULL;
2213
2214 return (int) n;
2215
2216 finish:
2217 bind_mount_free_many(bind_mounts, h);
2218 return r;
2219 }
2220
2221 static int apply_mount_namespace(
2222 Unit *u,
2223 ExecCommand *command,
2224 const ExecContext *context,
2225 const ExecParameters *params,
2226 ExecRuntime *runtime) {
2227
2228 _cleanup_strv_free_ char **empty_directories = NULL;
2229 char *tmp = NULL, *var = NULL;
2230 const char *root_dir = NULL, *root_image = NULL;
2231 NamespaceInfo ns_info = {
2232 .ignore_protect_paths = false,
2233 .private_dev = context->private_devices,
2234 .protect_control_groups = context->protect_control_groups,
2235 .protect_kernel_tunables = context->protect_kernel_tunables,
2236 .protect_kernel_modules = context->protect_kernel_modules,
2237 .mount_apivfs = context->mount_apivfs,
2238 };
2239 bool needs_sandboxing;
2240 BindMount *bind_mounts = NULL;
2241 unsigned n_bind_mounts = 0;
2242 int r;
2243
2244 assert(context);
2245
2246 /* The runtime struct only contains the parent of the private /tmp,
2247 * which is non-accessible to world users. Inside of it there's a /tmp
2248 * that is sticky, and that's the one we want to use here. */
2249
2250 if (context->private_tmp && runtime) {
2251 if (runtime->tmp_dir)
2252 tmp = strjoina(runtime->tmp_dir, "/tmp");
2253 if (runtime->var_tmp_dir)
2254 var = strjoina(runtime->var_tmp_dir, "/tmp");
2255 }
2256
2257 if (params->flags & EXEC_APPLY_CHROOT) {
2258 root_image = context->root_image;
2259
2260 if (!root_image)
2261 root_dir = context->root_directory;
2262 }
2263
2264 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2265 if (r < 0)
2266 return r;
2267
2268 /*
2269 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2270 * sandbox info, otherwise enforce it, don't ignore protected paths and
2271 * fail if we are enable to apply the sandbox inside the mount namespace.
2272 */
2273 if (!context->dynamic_user && root_dir)
2274 ns_info.ignore_protect_paths = true;
2275
2276 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2277
2278 r = setup_namespace(root_dir, root_image,
2279 &ns_info, context->read_write_paths,
2280 needs_sandboxing ? context->read_only_paths : NULL,
2281 needs_sandboxing ? context->inaccessible_paths : NULL,
2282 empty_directories,
2283 bind_mounts,
2284 n_bind_mounts,
2285 tmp,
2286 var,
2287 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2288 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2289 context->mount_flags,
2290 DISSECT_IMAGE_DISCARD_ON_LOOP);
2291
2292 bind_mount_free_many(bind_mounts, n_bind_mounts);
2293
2294 /* If we couldn't set up the namespace this is probably due to a
2295 * missing capability. In this case, silently proceeed. */
2296 if (IN_SET(r, -EPERM, -EACCES)) {
2297 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2298 return 0;
2299 }
2300
2301 return r;
2302 }
2303
2304 static int apply_working_directory(
2305 const ExecContext *context,
2306 const ExecParameters *params,
2307 const char *home,
2308 const bool needs_mount_ns,
2309 int *exit_status) {
2310
2311 const char *d, *wd;
2312
2313 assert(context);
2314 assert(exit_status);
2315
2316 if (context->working_directory_home) {
2317
2318 if (!home) {
2319 *exit_status = EXIT_CHDIR;
2320 return -ENXIO;
2321 }
2322
2323 wd = home;
2324
2325 } else if (context->working_directory)
2326 wd = context->working_directory;
2327 else
2328 wd = "/";
2329
2330 if (params->flags & EXEC_APPLY_CHROOT) {
2331 if (!needs_mount_ns && context->root_directory)
2332 if (chroot(context->root_directory) < 0) {
2333 *exit_status = EXIT_CHROOT;
2334 return -errno;
2335 }
2336
2337 d = wd;
2338 } else
2339 d = prefix_roota(context->root_directory, wd);
2340
2341 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2342 *exit_status = EXIT_CHDIR;
2343 return -errno;
2344 }
2345
2346 return 0;
2347 }
2348
2349 static int setup_keyring(
2350 Unit *u,
2351 const ExecContext *context,
2352 const ExecParameters *p,
2353 uid_t uid, gid_t gid) {
2354
2355 key_serial_t keyring;
2356 int r;
2357
2358 assert(u);
2359 assert(context);
2360 assert(p);
2361
2362 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2363 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2364 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2365 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2366 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2367 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2368
2369 if (!(p->flags & EXEC_NEW_KEYRING))
2370 return 0;
2371
2372 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2373 return 0;
2374
2375 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2376 if (keyring == -1) {
2377 if (errno == ENOSYS)
2378 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2379 else if (IN_SET(errno, EACCES, EPERM))
2380 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2381 else if (errno == EDQUOT)
2382 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2383 else
2384 return log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2385
2386 return 0;
2387 }
2388
2389 /* Populate they keyring with the invocation ID by default. */
2390 if (!sd_id128_is_null(u->invocation_id)) {
2391 key_serial_t key;
2392
2393 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2394 if (key == -1)
2395 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2396 else {
2397 if (keyctl(KEYCTL_SETPERM, key,
2398 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2399 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2400 return log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2401 }
2402 }
2403
2404 /* And now, make the keyring owned by the service's user */
2405 if (uid_is_valid(uid) || gid_is_valid(gid))
2406 if (keyctl(KEYCTL_CHOWN, keyring, uid, gid, 0) < 0)
2407 return log_unit_error_errno(u, errno, "Failed to change ownership of session keyring: %m");
2408
2409 /* When requested link the user keyring into the session keyring. */
2410 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2411 uid_t saved_uid;
2412 gid_t saved_gid;
2413
2414 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
2415 * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
2416 * sucks for parallel execution. This mimics what pam_keyinit does, too.*/
2417
2418 saved_uid = getuid();
2419 saved_gid = getgid();
2420
2421 if (gid_is_valid(gid) && gid != saved_gid) {
2422 if (setregid(gid, -1) < 0)
2423 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2424 }
2425
2426 if (uid_is_valid(uid) && uid != saved_uid) {
2427 if (setreuid(uid, -1) < 0) {
2428 (void) setregid(saved_gid, -1);
2429 return log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2430 }
2431 }
2432
2433 if (keyctl(KEYCTL_LINK,
2434 KEY_SPEC_USER_KEYRING,
2435 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2436
2437 r = -errno;
2438
2439 (void) setreuid(saved_uid, -1);
2440 (void) setregid(saved_gid, -1);
2441
2442 return log_unit_error_errno(u, r, "Failed to link user keyring into session keyring: %m");
2443 }
2444
2445 if (uid_is_valid(uid) && uid != saved_uid) {
2446 if (setreuid(saved_uid, -1) < 0) {
2447 (void) setregid(saved_gid, -1);
2448 return log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2449 }
2450 }
2451
2452 if (gid_is_valid(gid) && gid != saved_gid) {
2453 if (setregid(saved_gid, -1) < 0)
2454 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2455 }
2456 }
2457
2458 return 0;
2459 }
2460
/* Appends each non-negative fd of a two-element fd pair to array[], advancing *n for every fd added.
 * A NULL pair is accepted and leaves array[] and *n untouched. The caller must ensure array[] has room
 * for up to two more entries. */
static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
        unsigned k;

        assert(array);
        assert(n);

        if (pair)
                for (k = 0; k < 2; k++) {
                        /* Negative entries denote "no fd" and are skipped */
                        if (pair[k] < 0)
                                continue;

                        array[*n] = pair[k];
                        *n += 1;
                }
}
2473
2474 static int close_remaining_fds(
2475 const ExecParameters *params,
2476 ExecRuntime *runtime,
2477 DynamicCreds *dcreds,
2478 int user_lookup_fd,
2479 int socket_fd,
2480 int *fds, unsigned n_fds) {
2481
2482 unsigned n_dont_close = 0;
2483 int dont_close[n_fds + 12];
2484
2485 assert(params);
2486
2487 if (params->stdin_fd >= 0)
2488 dont_close[n_dont_close++] = params->stdin_fd;
2489 if (params->stdout_fd >= 0)
2490 dont_close[n_dont_close++] = params->stdout_fd;
2491 if (params->stderr_fd >= 0)
2492 dont_close[n_dont_close++] = params->stderr_fd;
2493
2494 if (socket_fd >= 0)
2495 dont_close[n_dont_close++] = socket_fd;
2496 if (n_fds > 0) {
2497 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2498 n_dont_close += n_fds;
2499 }
2500
2501 if (runtime)
2502 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2503
2504 if (dcreds) {
2505 if (dcreds->user)
2506 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2507 if (dcreds->group)
2508 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2509 }
2510
2511 if (user_lookup_fd >= 0)
2512 dont_close[n_dont_close++] = user_lookup_fd;
2513
2514 return close_all_fds(dont_close, n_dont_close);
2515 }
2516
2517 static int send_user_lookup(
2518 Unit *unit,
2519 int user_lookup_fd,
2520 uid_t uid,
2521 gid_t gid) {
2522
2523 assert(unit);
2524
2525 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2526 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2527 * specified. */
2528
2529 if (user_lookup_fd < 0)
2530 return 0;
2531
2532 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2533 return 0;
2534
2535 if (writev(user_lookup_fd,
2536 (struct iovec[]) {
2537 IOVEC_INIT(&uid, sizeof(uid)),
2538 IOVEC_INIT(&gid, sizeof(gid)),
2539 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2540 return -errno;
2541
2542 return 0;
2543 }
2544
2545 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2546 int r;
2547
2548 assert(c);
2549 assert(home);
2550 assert(buf);
2551
2552 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2553
2554 if (*home)
2555 return 0;
2556
2557 if (!c->working_directory_home)
2558 return 0;
2559
2560 if (uid == 0) {
2561 /* Hardcode /root as home directory for UID 0 */
2562 *home = "/root";
2563 return 1;
2564 }
2565
2566 r = get_home_dir(buf);
2567 if (r < 0)
2568 return r;
2569
2570 *home = *buf;
2571 return 1;
2572 }
2573
2574 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2575 _cleanup_strv_free_ char ** list = NULL;
2576 ExecDirectoryType t;
2577 int r;
2578
2579 assert(c);
2580 assert(p);
2581 assert(ret);
2582
2583 assert(c->dynamic_user);
2584
2585 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2586 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2587 * directories. */
2588
2589 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2590 char **i;
2591
2592 if (t == EXEC_DIRECTORY_CONFIGURATION)
2593 continue;
2594
2595 if (!p->prefix[t])
2596 continue;
2597
2598 STRV_FOREACH(i, c->directories[t].paths) {
2599 char *e;
2600
2601 if (t == EXEC_DIRECTORY_RUNTIME)
2602 e = strjoin(p->prefix[t], "/", *i);
2603 else
2604 e = strjoin(p->prefix[t], "/private/", *i);
2605 if (!e)
2606 return -ENOMEM;
2607
2608 r = strv_consume(&list, e);
2609 if (r < 0)
2610 return r;
2611 }
2612 }
2613
2614 *ret = list;
2615 list = NULL;
2616
2617 return 0;
2618 }
2619
2620 static int exec_child(
2621 Unit *unit,
2622 ExecCommand *command,
2623 const ExecContext *context,
2624 const ExecParameters *params,
2625 ExecRuntime *runtime,
2626 DynamicCreds *dcreds,
2627 char **argv,
2628 int socket_fd,
2629 int named_iofds[3],
2630 int *fds,
2631 unsigned n_storage_fds,
2632 unsigned n_socket_fds,
2633 char **files_env,
2634 int user_lookup_fd,
2635 int *exit_status) {
2636
2637 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2638 _cleanup_free_ char *mac_selinux_context_net = NULL, *home_buffer = NULL;
2639 _cleanup_free_ gid_t *supplementary_gids = NULL;
2640 const char *username = NULL, *groupname = NULL;
2641 const char *home = NULL, *shell = NULL;
2642 dev_t journal_stream_dev = 0;
2643 ino_t journal_stream_ino = 0;
2644 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2645 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
2646 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
2647 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
2648 #if HAVE_SELINUX
2649 bool use_selinux = false;
2650 #endif
2651 #if ENABLE_SMACK
2652 bool use_smack = false;
2653 #endif
2654 #if HAVE_APPARMOR
2655 bool use_apparmor = false;
2656 #endif
2657 uid_t uid = UID_INVALID;
2658 gid_t gid = GID_INVALID;
2659 int i, r, ngids = 0;
2660 unsigned n_fds;
2661 ExecDirectoryType dt;
2662 int secure_bits;
2663
2664 assert(unit);
2665 assert(command);
2666 assert(context);
2667 assert(params);
2668 assert(exit_status);
2669
2670 rename_process_from_path(command->path);
2671
2672 /* We reset exactly these signals, since they are the
2673 * only ones we set to SIG_IGN in the main daemon. All
2674 * others we leave untouched because we set them to
2675 * SIG_DFL or a valid handler initially, both of which
2676 * will be demoted to SIG_DFL. */
2677 (void) default_signals(SIGNALS_CRASH_HANDLER,
2678 SIGNALS_IGNORE, -1);
2679
2680 if (context->ignore_sigpipe)
2681 (void) ignore_signals(SIGPIPE, -1);
2682
2683 r = reset_signal_mask();
2684 if (r < 0) {
2685 *exit_status = EXIT_SIGNAL_MASK;
2686 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2687 }
2688
2689 if (params->idle_pipe)
2690 do_idle_pipe_dance(params->idle_pipe);
2691
2692 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2693 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2694 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2695 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2696
2697 log_forget_fds();
2698 log_set_open_when_needed(true);
2699
2700 /* In case anything used libc syslog(), close this here, too */
2701 closelog();
2702
2703 n_fds = n_storage_fds + n_socket_fds;
2704 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
2705 if (r < 0) {
2706 *exit_status = EXIT_FDS;
2707 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2708 }
2709
2710 if (!context->same_pgrp)
2711 if (setsid() < 0) {
2712 *exit_status = EXIT_SETSID;
2713 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2714 }
2715
2716 exec_context_tty_reset(context, params);
2717
2718 if (unit_shall_confirm_spawn(unit)) {
2719 const char *vc = params->confirm_spawn;
2720 _cleanup_free_ char *cmdline = NULL;
2721
2722 cmdline = exec_command_line(argv);
2723 if (!cmdline) {
2724 *exit_status = EXIT_MEMORY;
2725 return log_oom();
2726 }
2727
2728 r = ask_for_confirmation(vc, unit, cmdline);
2729 if (r != CONFIRM_EXECUTE) {
2730 if (r == CONFIRM_PRETEND_SUCCESS) {
2731 *exit_status = EXIT_SUCCESS;
2732 return 0;
2733 }
2734 *exit_status = EXIT_CONFIRM;
2735 log_unit_error(unit, "Execution cancelled by the user");
2736 return -ECANCELED;
2737 }
2738 }
2739
2740 if (context->dynamic_user && dcreds) {
2741 _cleanup_strv_free_ char **suggested_paths = NULL;
2742
2743 /* Make sure we bypass our own NSS module for any NSS checks */
2744 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2745 *exit_status = EXIT_USER;
2746 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2747 }
2748
2749 r = compile_suggested_paths(context, params, &suggested_paths);
2750 if (r < 0) {
2751 *exit_status = EXIT_MEMORY;
2752 return log_oom();
2753 }
2754
2755 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2756 if (r < 0) {
2757 *exit_status = EXIT_USER;
2758 if (r == -EILSEQ) {
2759 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2760 return -EOPNOTSUPP;
2761 }
2762 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2763 }
2764
2765 if (!uid_is_valid(uid)) {
2766 *exit_status = EXIT_USER;
2767 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2768 return -ESRCH;
2769 }
2770
2771 if (!gid_is_valid(gid)) {
2772 *exit_status = EXIT_USER;
2773 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2774 return -ESRCH;
2775 }
2776
2777 if (dcreds->user)
2778 username = dcreds->user->name;
2779
2780 } else {
2781 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2782 if (r < 0) {
2783 *exit_status = EXIT_USER;
2784 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2785 }
2786
2787 r = get_fixed_group(context, &groupname, &gid);
2788 if (r < 0) {
2789 *exit_status = EXIT_GROUP;
2790 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2791 }
2792 }
2793
2794 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2795 r = get_supplementary_groups(context, username, groupname, gid,
2796 &supplementary_gids, &ngids);
2797 if (r < 0) {
2798 *exit_status = EXIT_GROUP;
2799 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2800 }
2801
2802 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2803 if (r < 0) {
2804 *exit_status = EXIT_USER;
2805 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2806 }
2807
2808 user_lookup_fd = safe_close(user_lookup_fd);
2809
2810 r = acquire_home(context, uid, &home, &home_buffer);
2811 if (r < 0) {
2812 *exit_status = EXIT_CHDIR;
2813 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2814 }
2815
2816 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2817 * must sure to drop O_NONBLOCK */
2818 if (socket_fd >= 0)
2819 (void) fd_nonblock(socket_fd, false);
2820
2821 r = setup_input(context, params, socket_fd, named_iofds);
2822 if (r < 0) {
2823 *exit_status = EXIT_STDIN;
2824 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2825 }
2826
2827 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2828 if (r < 0) {
2829 *exit_status = EXIT_STDOUT;
2830 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2831 }
2832
2833 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2834 if (r < 0) {
2835 *exit_status = EXIT_STDERR;
2836 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2837 }
2838
2839 if (params->cgroup_path) {
2840 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2841 if (r < 0) {
2842 *exit_status = EXIT_CGROUP;
2843 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2844 }
2845 }
2846
2847 if (context->oom_score_adjust_set) {
2848 char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
2849
2850 /* When we can't make this change due to EPERM, then
2851 * let's silently skip over it. User namespaces
2852 * prohibit write access to this file, and we
2853 * shouldn't trip up over that. */
2854
2855 sprintf(t, "%i", context->oom_score_adjust);
2856 r = write_string_file("/proc/self/oom_score_adj", t, 0);
2857 if (IN_SET(r, -EPERM, -EACCES))
2858 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2859 else if (r < 0) {
2860 *exit_status = EXIT_OOM_ADJUST;
2861 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
2862 }
2863 }
2864
2865 if (context->nice_set)
2866 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2867 *exit_status = EXIT_NICE;
2868 return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
2869 }
2870
2871 if (context->cpu_sched_set) {
2872 struct sched_param param = {
2873 .sched_priority = context->cpu_sched_priority,
2874 };
2875
2876 r = sched_setscheduler(0,
2877 context->cpu_sched_policy |
2878 (context->cpu_sched_reset_on_fork ?
2879 SCHED_RESET_ON_FORK : 0),
2880 &param);
2881 if (r < 0) {
2882 *exit_status = EXIT_SETSCHEDULER;
2883 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
2884 }
2885 }
2886
2887 if (context->cpuset)
2888 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2889 *exit_status = EXIT_CPUAFFINITY;
2890 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
2891 }
2892
2893 if (context->ioprio_set)
2894 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
2895 *exit_status = EXIT_IOPRIO;
2896 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
2897 }
2898
2899 if (context->timer_slack_nsec != NSEC_INFINITY)
2900 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
2901 *exit_status = EXIT_TIMERSLACK;
2902 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
2903 }
2904
2905 if (context->personality != PERSONALITY_INVALID) {
2906 r = safe_personality(context->personality);
2907 if (r < 0) {
2908 *exit_status = EXIT_PERSONALITY;
2909 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
2910 }
2911 }
2912
2913 if (context->utmp_id)
2914 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
2915 context->tty_path,
2916 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
2917 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
2918 USER_PROCESS,
2919 username);
2920
2921 if (context->user) {
2922 r = chown_terminal(STDIN_FILENO, uid);
2923 if (r < 0) {
2924 *exit_status = EXIT_STDIN;
2925 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
2926 }
2927 }
2928
2929 /* If delegation is enabled we'll pass ownership of the cgroup
2930 * (but only in systemd's own controller hierarchy!) to the
2931 * user of the new process. */
2932 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
2933 r = cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0644, uid, gid);
2934 if (r < 0) {
2935 *exit_status = EXIT_CGROUP;
2936 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
2937 }
2938
2939 r = cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0755, uid, gid);
2940 if (r < 0) {
2941 *exit_status = EXIT_CGROUP;
2942 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
2943 }
2944 }
2945
2946 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2947 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
2948 if (r < 0)
2949 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
2950 }
2951
2952 r = build_environment(
2953 unit,
2954 context,
2955 params,
2956 n_fds,
2957 home,
2958 username,
2959 shell,
2960 journal_stream_dev,
2961 journal_stream_ino,
2962 &our_env);
2963 if (r < 0) {
2964 *exit_status = EXIT_MEMORY;
2965 return log_oom();
2966 }
2967
2968 r = build_pass_environment(context, &pass_env);
2969 if (r < 0) {
2970 *exit_status = EXIT_MEMORY;
2971 return log_oom();
2972 }
2973
2974 accum_env = strv_env_merge(5,
2975 params->environment,
2976 our_env,
2977 pass_env,
2978 context->environment,
2979 files_env,
2980 NULL);
2981 if (!accum_env) {
2982 *exit_status = EXIT_MEMORY;
2983 return log_oom();
2984 }
2985 accum_env = strv_env_clean(accum_env);
2986
2987 (void) umask(context->umask);
2988
2989 r = setup_keyring(unit, context, params, uid, gid);
2990 if (r < 0) {
2991 *exit_status = EXIT_KEYRING;
2992 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
2993 }
2994
2995 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
2996 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2997
2998 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
2999 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3000
3001 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3002 if (needs_ambient_hack)
3003 needs_setuid = false;
3004 else
3005 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3006
3007 if (needs_sandboxing) {
3008 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3009 * present. The actual MAC context application will happen later, as late as possible, to avoid
3010 * impacting our own code paths. */
3011
3012 #if HAVE_SELINUX
3013 use_selinux = mac_selinux_use();
3014 #endif
3015 #if ENABLE_SMACK
3016 use_smack = mac_smack_use();
3017 #endif
3018 #if HAVE_APPARMOR
3019 use_apparmor = mac_apparmor_use();
3020 #endif
3021 }
3022
3023 if (needs_setuid) {
3024 if (context->pam_name && username) {
3025 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3026 if (r < 0) {
3027 *exit_status = EXIT_PAM;
3028 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3029 }
3030 }
3031 }
3032
3033 if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3034 if (ns_type_supported(NAMESPACE_NET)) {
3035 r = setup_netns(runtime->netns_storage_socket);
3036 if (r < 0) {
3037 *exit_status = EXIT_NETWORK;
3038 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3039 }
3040 } else
3041 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3042 }
3043
3044 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3045 if (needs_mount_namespace) {
3046 r = apply_mount_namespace(unit, command, context, params, runtime);
3047 if (r < 0) {
3048 *exit_status = EXIT_NAMESPACE;
3049 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3050 }
3051 }
3052
3053 /* Apply just after mount namespace setup */
3054 r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3055 if (r < 0)
3056 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3057
3058 /* Drop groups as early as possbile */
3059 if (needs_setuid) {
3060 r = enforce_groups(gid, supplementary_gids, ngids);
3061 if (r < 0) {
3062 *exit_status = EXIT_GROUP;
3063 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3064 }
3065 }
3066
3067 if (needs_sandboxing) {
3068 #if HAVE_SELINUX
3069 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3070 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3071 if (r < 0) {
3072 *exit_status = EXIT_SELINUX_CONTEXT;
3073 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3074 }
3075 }
3076 #endif
3077
3078 if (context->private_users) {
3079 r = setup_private_users(uid, gid);
3080 if (r < 0) {
3081 *exit_status = EXIT_USER;
3082 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3083 }
3084 }
3085 }
3086
3087 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3088 * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3089 * was needed to upload the policy and can now be closed as well. */
3090 r = close_all_fds(fds, n_fds);
3091 if (r >= 0)
3092 r = shift_fds(fds, n_fds);
3093 if (r >= 0)
3094 r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
3095 if (r < 0) {
3096 *exit_status = EXIT_FDS;
3097 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3098 }
3099
3100 secure_bits = context->secure_bits;
3101
3102 if (needs_sandboxing) {
3103 uint64_t bset;
3104
3105 for (i = 0; i < _RLIMIT_MAX; i++) {
3106
3107 if (!context->rlimit[i])
3108 continue;
3109
3110 r = setrlimit_closest(i, context->rlimit[i]);
3111 if (r < 0) {
3112 *exit_status = EXIT_LIMITS;
3113 return log_unit_error_errno(unit, r, "Failed to adjust resource limit %s: %m", rlimit_to_string(i));
3114 }
3115 }
3116
3117 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3118 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3119 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3120 *exit_status = EXIT_LIMITS;
3121 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3122 }
3123 }
3124
3125 bset = context->capability_bounding_set;
3126 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3127 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3128 * instead of us doing that */
3129 if (needs_ambient_hack)
3130 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3131 (UINT64_C(1) << CAP_SETUID) |
3132 (UINT64_C(1) << CAP_SETGID);
3133
3134 if (!cap_test_all(bset)) {
3135 r = capability_bounding_set_drop(bset, false);
3136 if (r < 0) {
3137 *exit_status = EXIT_CAPABILITIES;
3138 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3139 }
3140 }
3141
3142 /* This is done before enforce_user, but ambient set
3143 * does not survive over setresuid() if keep_caps is not set. */
3144 if (!needs_ambient_hack &&
3145 context->capability_ambient_set != 0) {
3146 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3147 if (r < 0) {
3148 *exit_status = EXIT_CAPABILITIES;
3149 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3150 }
3151 }
3152 }
3153
3154 if (needs_setuid) {
3155 if (context->user) {
3156 r = enforce_user(context, uid);
3157 if (r < 0) {
3158 *exit_status = EXIT_USER;
3159 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3160 }
3161
3162 if (!needs_ambient_hack &&
3163 context->capability_ambient_set != 0) {
3164
3165 /* Fix the ambient capabilities after user change. */
3166 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3167 if (r < 0) {
3168 *exit_status = EXIT_CAPABILITIES;
3169 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3170 }
3171
3172 /* If we were asked to change user and ambient capabilities
3173 * were requested, we had to add keep-caps to the securebits
3174 * so that we would maintain the inherited capability set
3175 * through the setresuid(). Make sure that the bit is added
3176 * also to the context secure_bits so that we don't try to
3177 * drop the bit away next. */
3178
3179 secure_bits |= 1<<SECURE_KEEP_CAPS;
3180 }
3181 }
3182 }
3183
3184 if (needs_sandboxing) {
3185 /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3186 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3187 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3188 * are restricted. */
3189
3190 #if HAVE_SELINUX
3191 if (use_selinux) {
3192 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3193
3194 if (exec_context) {
3195 r = setexeccon(exec_context);
3196 if (r < 0) {
3197 *exit_status = EXIT_SELINUX_CONTEXT;
3198 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3199 }
3200 }
3201 }
3202 #endif
3203
3204 #if ENABLE_SMACK
3205 if (use_smack) {
3206 r = setup_smack(context, command);
3207 if (r < 0) {
3208 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3209 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3210 }
3211 }
3212 #endif
3213
3214 #if HAVE_APPARMOR
3215 if (use_apparmor && context->apparmor_profile) {
3216 r = aa_change_onexec(context->apparmor_profile);
3217 if (r < 0 && !context->apparmor_profile_ignore) {
3218 *exit_status = EXIT_APPARMOR_PROFILE;
3219 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3220 }
3221 }
3222 #endif
3223
3224 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3225 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3226 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3227 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3228 *exit_status = EXIT_SECUREBITS;
3229 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3230 }
3231
3232 if (context_has_no_new_privileges(context))
3233 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3234 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3235 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3236 }
3237
3238 #if HAVE_SECCOMP
3239 r = apply_address_families(unit, context);
3240 if (r < 0) {
3241 *exit_status = EXIT_ADDRESS_FAMILIES;
3242 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3243 }
3244
3245 r = apply_memory_deny_write_execute(unit, context);
3246 if (r < 0) {
3247 *exit_status = EXIT_SECCOMP;
3248 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3249 }
3250
3251 r = apply_restrict_realtime(unit, context);
3252 if (r < 0) {
3253 *exit_status = EXIT_SECCOMP;
3254 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3255 }
3256
3257 r = apply_restrict_namespaces(unit, context);
3258 if (r < 0) {
3259 *exit_status = EXIT_SECCOMP;
3260 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3261 }
3262
3263 r = apply_protect_sysctl(unit, context);
3264 if (r < 0) {
3265 *exit_status = EXIT_SECCOMP;
3266 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3267 }
3268
3269 r = apply_protect_kernel_modules(unit, context);
3270 if (r < 0) {
3271 *exit_status = EXIT_SECCOMP;
3272 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3273 }
3274
3275 r = apply_private_devices(unit, context);
3276 if (r < 0) {
3277 *exit_status = EXIT_SECCOMP;
3278 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3279 }
3280
3281 r = apply_syscall_archs(unit, context);
3282 if (r < 0) {
3283 *exit_status = EXIT_SECCOMP;
3284 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3285 }
3286
3287 r = apply_lock_personality(unit, context);
3288 if (r < 0) {
3289 *exit_status = EXIT_SECCOMP;
3290 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3291 }
3292
3293 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3294 * by the filter as little as possible. */
3295 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3296 if (r < 0) {
3297 *exit_status = EXIT_SECCOMP;
3298 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3299 }
3300 #endif
3301 }
3302
3303 if (!strv_isempty(context->unset_environment)) {
3304 char **ee = NULL;
3305
3306 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3307 if (!ee) {
3308 *exit_status = EXIT_MEMORY;
3309 return log_oom();
3310 }
3311
3312 strv_free(accum_env);
3313 accum_env = ee;
3314 }
3315
3316 final_argv = replace_env_argv(argv, accum_env);
3317 if (!final_argv) {
3318 *exit_status = EXIT_MEMORY;
3319 return log_oom();
3320 }
3321
3322 if (_unlikely_(log_get_max_level() >= LOG_DEBUG)) {
3323 _cleanup_free_ char *line;
3324
3325 line = exec_command_line(final_argv);
3326 if (line) {
3327 log_struct(LOG_DEBUG,
3328 "EXECUTABLE=%s", command->path,
3329 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3330 LOG_UNIT_ID(unit),
3331 LOG_UNIT_INVOCATION_ID(unit),
3332 NULL);
3333 }
3334 }
3335
3336 execve(command->path, final_argv, accum_env);
3337
3338 if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3339
3340 log_struct_errno(LOG_INFO, errno,
3341 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3342 LOG_UNIT_ID(unit),
3343 LOG_UNIT_INVOCATION_ID(unit),
3344 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3345 command->path),
3346 "EXECUTABLE=%s", command->path,
3347 NULL);
3348
3349 return 0;
3350 }
3351
3352 *exit_status = EXIT_EXEC;
3353 return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
3354 }
3355
3356 int exec_spawn(Unit *unit,
3357 ExecCommand *command,
3358 const ExecContext *context,
3359 const ExecParameters *params,
3360 ExecRuntime *runtime,
3361 DynamicCreds *dcreds,
3362 pid_t *ret) {
3363
3364 _cleanup_strv_free_ char **files_env = NULL;
3365 int *fds = NULL;
3366 unsigned n_storage_fds = 0, n_socket_fds = 0;
3367 _cleanup_free_ char *line = NULL;
3368 int socket_fd, r;
3369 int named_iofds[3] = { -1, -1, -1 };
3370 char **argv;
3371 pid_t pid;
3372
3373 assert(unit);
3374 assert(command);
3375 assert(context);
3376 assert(ret);
3377 assert(params);
3378 assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
3379
3380 if (context->std_input == EXEC_INPUT_SOCKET ||
3381 context->std_output == EXEC_OUTPUT_SOCKET ||
3382 context->std_error == EXEC_OUTPUT_SOCKET) {
3383
3384 if (params->n_socket_fds > 1) {
3385 log_unit_error(unit, "Got more than one socket.");
3386 return -EINVAL;
3387 }
3388
3389 if (params->n_socket_fds == 0) {
3390 log_unit_error(unit, "Got no socket.");
3391 return -EINVAL;
3392 }
3393
3394 socket_fd = params->fds[0];
3395 } else {
3396 socket_fd = -1;
3397 fds = params->fds;
3398 n_storage_fds = params->n_storage_fds;
3399 n_socket_fds = params->n_socket_fds;
3400 }
3401
3402 r = exec_context_named_iofds(unit, context, params, named_iofds);
3403 if (r < 0)
3404 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3405
3406 r = exec_context_load_environment(unit, context, &files_env);
3407 if (r < 0)
3408 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3409
3410 argv = params->argv ?: command->argv;
3411 line = exec_command_line(argv);
3412 if (!line)
3413 return log_oom();
3414
3415 log_struct(LOG_DEBUG,
3416 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3417 "EXECUTABLE=%s", command->path,
3418 LOG_UNIT_ID(unit),
3419 LOG_UNIT_INVOCATION_ID(unit),
3420 NULL);
3421
3422 pid = fork();
3423 if (pid < 0)
3424 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3425
3426 if (pid == 0) {
3427 int exit_status = EXIT_SUCCESS;
3428
3429 r = exec_child(unit,
3430 command,
3431 context,
3432 params,
3433 runtime,
3434 dcreds,
3435 argv,
3436 socket_fd,
3437 named_iofds,
3438 fds,
3439 n_storage_fds,
3440 n_socket_fds,
3441 files_env,
3442 unit->manager->user_lookup_fds[1],
3443 &exit_status);
3444
3445 if (r < 0) {
3446 log_struct_errno(LOG_ERR, r,
3447 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3448 LOG_UNIT_ID(unit),
3449 LOG_UNIT_INVOCATION_ID(unit),
3450 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3451 exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3452 command->path),
3453 "EXECUTABLE=%s", command->path,
3454 NULL);
3455 }
3456
3457 _exit(exit_status);
3458 }
3459
3460 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3461
3462 /* We add the new process to the cgroup both in the child (so
3463 * that we can be sure that no user code is ever executed
3464 * outside of the cgroup) and in the parent (so that we can be
3465 * sure that when we kill the cgroup the process will be
3466 * killed too). */
3467 if (params->cgroup_path)
3468 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3469
3470 exec_status_start(&command->exec_status, pid);
3471
3472 *ret = pid;
3473 return 0;
3474 }
3475
3476 void exec_context_init(ExecContext *c) {
3477 ExecDirectoryType i;
3478
3479 assert(c);
3480
3481 c->umask = 0022;
3482 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3483 c->cpu_sched_policy = SCHED_OTHER;
3484 c->syslog_priority = LOG_DAEMON|LOG_INFO;
3485 c->syslog_level_prefix = true;
3486 c->ignore_sigpipe = true;
3487 c->timer_slack_nsec = NSEC_INFINITY;
3488 c->personality = PERSONALITY_INVALID;
3489 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3490 c->directories[i].mode = 0755;
3491 c->capability_bounding_set = CAP_ALL;
3492 c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
3493 c->log_level_max = -1;
3494 }
3495
3496 void exec_context_done(ExecContext *c) {
3497 ExecDirectoryType i;
3498 size_t l;
3499
3500 assert(c);
3501
3502 c->environment = strv_free(c->environment);
3503 c->environment_files = strv_free(c->environment_files);
3504 c->pass_environment = strv_free(c->pass_environment);
3505 c->unset_environment = strv_free(c->unset_environment);
3506
3507 for (l = 0; l < ELEMENTSOF(c->rlimit); l++)
3508 c->rlimit[l] = mfree(c->rlimit[l]);
3509
3510 for (l = 0; l < 3; l++)
3511 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3512
3513 c->working_directory = mfree(c->working_directory);
3514 c->root_directory = mfree(c->root_directory);
3515 c->root_image = mfree(c->root_image);
3516 c->tty_path = mfree(c->tty_path);
3517 c->syslog_identifier = mfree(c->syslog_identifier);
3518 c->user = mfree(c->user);
3519 c->group = mfree(c->group);
3520
3521 c->supplementary_groups = strv_free(c->supplementary_groups);
3522
3523 c->pam_name = mfree(c->pam_name);
3524
3525 c->read_only_paths = strv_free(c->read_only_paths);
3526 c->read_write_paths = strv_free(c->read_write_paths);
3527 c->inaccessible_paths = strv_free(c->inaccessible_paths);
3528
3529 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3530
3531 if (c->cpuset)
3532 CPU_FREE(c->cpuset);
3533
3534 c->utmp_id = mfree(c->utmp_id);
3535 c->selinux_context = mfree(c->selinux_context);
3536 c->apparmor_profile = mfree(c->apparmor_profile);
3537 c->smack_process_label = mfree(c->smack_process_label);
3538
3539 c->syscall_filter = hashmap_free(c->syscall_filter);
3540 c->syscall_archs = set_free(c->syscall_archs);
3541 c->address_families = set_free(c->address_families);
3542
3543 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3544 c->directories[i].paths = strv_free(c->directories[i].paths);
3545
3546 c->log_level_max = -1;
3547
3548 exec_context_free_log_extra_fields(c);
3549
3550 c->stdin_data = mfree(c->stdin_data);
3551 c->stdin_data_size = 0;
3552 }
3553
3554 int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_prefix) {
3555 char **i;
3556
3557 assert(c);
3558
3559 if (!runtime_prefix)
3560 return 0;
3561
3562 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3563 _cleanup_free_ char *p;
3564
3565 p = strjoin(runtime_prefix, "/", *i);
3566 if (!p)
3567 return -ENOMEM;
3568
3569 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3570 * next. */
3571 (void) rm_rf(p, REMOVE_ROOT);
3572 }
3573
3574 return 0;
3575 }
3576
3577 void exec_command_done(ExecCommand *c) {
3578 assert(c);
3579
3580 c->path = mfree(c->path);
3581
3582 c->argv = strv_free(c->argv);
3583 }
3584
3585 void exec_command_done_array(ExecCommand *c, unsigned n) {
3586 unsigned i;
3587
3588 for (i = 0; i < n; i++)
3589 exec_command_done(c+i);
3590 }
3591
3592 ExecCommand* exec_command_free_list(ExecCommand *c) {
3593 ExecCommand *i;
3594
3595 while ((i = c)) {
3596 LIST_REMOVE(command, c, i);
3597 exec_command_done(i);
3598 free(i);
3599 }
3600
3601 return NULL;
3602 }
3603
3604 void exec_command_free_array(ExecCommand **c, unsigned n) {
3605 unsigned i;
3606
3607 for (i = 0; i < n; i++)
3608 c[i] = exec_command_free_list(c[i]);
3609 }
3610
3611 typedef struct InvalidEnvInfo {
3612 Unit *unit;
3613 const char *path;
3614 } InvalidEnvInfo;
3615
3616 static void invalid_env(const char *p, void *userdata) {
3617 InvalidEnvInfo *info = userdata;
3618
3619 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3620 }
3621
3622 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3623 assert(c);
3624
3625 switch (fd_index) {
3626
3627 case STDIN_FILENO:
3628 if (c->std_input != EXEC_INPUT_NAMED_FD)
3629 return NULL;
3630
3631 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3632
3633 case STDOUT_FILENO:
3634 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3635 return NULL;
3636
3637 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3638
3639 case STDERR_FILENO:
3640 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3641 return NULL;
3642
3643 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3644
3645 default:
3646 return NULL;
3647 }
3648 }
3649
3650 int exec_context_named_iofds(Unit *unit, const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3651 unsigned i, targets;
3652 const char* stdio_fdname[3];
3653 unsigned n_fds;
3654
3655 assert(c);
3656 assert(p);
3657
3658 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3659 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3660 (c->std_error == EXEC_OUTPUT_NAMED_FD);
3661
3662 for (i = 0; i < 3; i++)
3663 stdio_fdname[i] = exec_context_fdname(c, i);
3664
3665 n_fds = p->n_storage_fds + p->n_socket_fds;
3666
3667 for (i = 0; i < n_fds && targets > 0; i++)
3668 if (named_iofds[STDIN_FILENO] < 0 &&
3669 c->std_input == EXEC_INPUT_NAMED_FD &&
3670 stdio_fdname[STDIN_FILENO] &&
3671 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3672
3673 named_iofds[STDIN_FILENO] = p->fds[i];
3674 targets--;
3675
3676 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3677 c->std_output == EXEC_OUTPUT_NAMED_FD &&
3678 stdio_fdname[STDOUT_FILENO] &&
3679 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3680
3681 named_iofds[STDOUT_FILENO] = p->fds[i];
3682 targets--;
3683
3684 } else if (named_iofds[STDERR_FILENO] < 0 &&
3685 c->std_error == EXEC_OUTPUT_NAMED_FD &&
3686 stdio_fdname[STDERR_FILENO] &&
3687 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3688
3689 named_iofds[STDERR_FILENO] = p->fds[i];
3690 targets--;
3691 }
3692
3693 return targets == 0 ? 0 : -ENOENT;
3694 }
3695
3696 int exec_context_load_environment(Unit *unit, const ExecContext *c, char ***l) {
3697 char **i, **r = NULL;
3698
3699 assert(c);
3700 assert(l);
3701
3702 STRV_FOREACH(i, c->environment_files) {
3703 char *fn;
3704 int k;
3705 unsigned n;
3706 bool ignore = false;
3707 char **p;
3708 _cleanup_globfree_ glob_t pglob = {};
3709
3710 fn = *i;
3711
3712 if (fn[0] == '-') {
3713 ignore = true;
3714 fn++;
3715 }
3716
3717 if (!path_is_absolute(fn)) {
3718 if (ignore)
3719 continue;
3720
3721 strv_free(r);
3722 return -EINVAL;
3723 }
3724
3725 /* Filename supports globbing, take all matching files */
3726 k = safe_glob(fn, 0, &pglob);
3727 if (k < 0) {
3728 if (ignore)
3729 continue;
3730
3731 strv_free(r);
3732 return k;
3733 }
3734
3735 /* When we don't match anything, -ENOENT should be returned */
3736 assert(pglob.gl_pathc > 0);
3737
3738 for (n = 0; n < pglob.gl_pathc; n++) {
3739 k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3740 if (k < 0) {
3741 if (ignore)
3742 continue;
3743
3744 strv_free(r);
3745 return k;
3746 }
3747 /* Log invalid environment variables with filename */
3748 if (p) {
3749 InvalidEnvInfo info = {
3750 .unit = unit,
3751 .path = pglob.gl_pathv[n]
3752 };
3753
3754 p = strv_env_clean_with_callback(p, invalid_env, &info);
3755 }
3756
3757 if (r == NULL)
3758 r = p;
3759 else {
3760 char **m;
3761
3762 m = strv_env_merge(2, r, p);
3763 strv_free(r);
3764 strv_free(p);
3765 if (!m)
3766 return -ENOMEM;
3767
3768 r = m;
3769 }
3770 }
3771 }
3772
3773 *l = r;
3774
3775 return 0;
3776 }
3777
3778 static bool tty_may_match_dev_console(const char *tty) {
3779 _cleanup_free_ char *active = NULL;
3780 char *console;
3781
3782 if (!tty)
3783 return true;
3784
3785 tty = skip_dev_prefix(tty);
3786
3787 /* trivial identity? */
3788 if (streq(tty, "console"))
3789 return true;
3790
3791 console = resolve_dev_console(&active);
3792 /* if we could not resolve, assume it may */
3793 if (!console)
3794 return true;
3795
3796 /* "tty0" means the active VC, so it may be the same sometimes */
3797 return streq(console, tty) || (streq(console, "tty0") && tty_is_vc(tty));
3798 }
3799
3800 bool exec_context_may_touch_console(ExecContext *ec) {
3801
3802 return (ec->tty_reset ||
3803 ec->tty_vhangup ||
3804 ec->tty_vt_disallocate ||
3805 is_terminal_input(ec->std_input) ||
3806 is_terminal_output(ec->std_output) ||
3807 is_terminal_output(ec->std_error)) &&
3808 tty_may_match_dev_console(exec_context_tty_path(ec));
3809 }
3810
3811 static void strv_fprintf(FILE *f, char **l) {
3812 char **g;
3813
3814 assert(f);
3815
3816 STRV_FOREACH(g, l)
3817 fprintf(f, " %s", *g);
3818 }
3819
3820 void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
3821 ExecDirectoryType dt;
3822 char **e, **d;
3823 unsigned i;
3824 int r;
3825
3826 assert(c);
3827 assert(f);
3828
3829 prefix = strempty(prefix);
3830
3831 fprintf(f,
3832 "%sUMask: %04o\n"
3833 "%sWorkingDirectory: %s\n"
3834 "%sRootDirectory: %s\n"
3835 "%sNonBlocking: %s\n"
3836 "%sPrivateTmp: %s\n"
3837 "%sPrivateDevices: %s\n"
3838 "%sProtectKernelTunables: %s\n"
3839 "%sProtectKernelModules: %s\n"
3840 "%sProtectControlGroups: %s\n"
3841 "%sPrivateNetwork: %s\n"
3842 "%sPrivateUsers: %s\n"
3843 "%sProtectHome: %s\n"
3844 "%sProtectSystem: %s\n"
3845 "%sMountAPIVFS: %s\n"
3846 "%sIgnoreSIGPIPE: %s\n"
3847 "%sMemoryDenyWriteExecute: %s\n"
3848 "%sRestrictRealtime: %s\n"
3849 "%sKeyringMode: %s\n",
3850 prefix, c->umask,
3851 prefix, c->working_directory ? c->working_directory : "/",
3852 prefix, c->root_directory ? c->root_directory : "/",
3853 prefix, yes_no(c->non_blocking),
3854 prefix, yes_no(c->private_tmp),
3855 prefix, yes_no(c->private_devices),
3856 prefix, yes_no(c->protect_kernel_tunables),
3857 prefix, yes_no(c->protect_kernel_modules),
3858 prefix, yes_no(c->protect_control_groups),
3859 prefix, yes_no(c->private_network),
3860 prefix, yes_no(c->private_users),
3861 prefix, protect_home_to_string(c->protect_home),
3862 prefix, protect_system_to_string(c->protect_system),
3863 prefix, yes_no(c->mount_apivfs),
3864 prefix, yes_no(c->ignore_sigpipe),
3865 prefix, yes_no(c->memory_deny_write_execute),
3866 prefix, yes_no(c->restrict_realtime),
3867 prefix, exec_keyring_mode_to_string(c->keyring_mode));
3868
3869 if (c->root_image)
3870 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
3871
3872 STRV_FOREACH(e, c->environment)
3873 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
3874
3875 STRV_FOREACH(e, c->environment_files)
3876 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
3877
3878 STRV_FOREACH(e, c->pass_environment)
3879 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
3880
3881 STRV_FOREACH(e, c->unset_environment)
3882 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
3883
3884 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
3885
3886 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3887 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
3888
3889 STRV_FOREACH(d, c->directories[dt].paths)
3890 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
3891 }
3892
3893 if (c->nice_set)
3894 fprintf(f,
3895 "%sNice: %i\n",
3896 prefix, c->nice);
3897
3898 if (c->oom_score_adjust_set)
3899 fprintf(f,
3900 "%sOOMScoreAdjust: %i\n",
3901 prefix, c->oom_score_adjust);
3902
3903 for (i = 0; i < RLIM_NLIMITS; i++)
3904 if (c->rlimit[i]) {
3905 fprintf(f, "%s%s: " RLIM_FMT "\n",
3906 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
3907 fprintf(f, "%s%sSoft: " RLIM_FMT "\n",
3908 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
3909 }
3910
3911 if (c->ioprio_set) {
3912 _cleanup_free_ char *class_str = NULL;
3913
3914 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
3915 if (r >= 0)
3916 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
3917
3918 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
3919 }
3920
3921 if (c->cpu_sched_set) {
3922 _cleanup_free_ char *policy_str = NULL;
3923
3924 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
3925 if (r >= 0)
3926 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
3927
3928 fprintf(f,
3929 "%sCPUSchedulingPriority: %i\n"
3930 "%sCPUSchedulingResetOnFork: %s\n",
3931 prefix, c->cpu_sched_priority,
3932 prefix, yes_no(c->cpu_sched_reset_on_fork));
3933 }
3934
3935 if (c->cpuset) {
3936 fprintf(f, "%sCPUAffinity:", prefix);
3937 for (i = 0; i < c->cpuset_ncpus; i++)
3938 if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
3939 fprintf(f, " %u", i);
3940 fputs("\n", f);
3941 }
3942
3943 if (c->timer_slack_nsec != NSEC_INFINITY)
3944 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
3945
3946 fprintf(f,
3947 "%sStandardInput: %s\n"
3948 "%sStandardOutput: %s\n"
3949 "%sStandardError: %s\n",
3950 prefix, exec_input_to_string(c->std_input),
3951 prefix, exec_output_to_string(c->std_output),
3952 prefix, exec_output_to_string(c->std_error));
3953
3954 if (c->tty_path)
3955 fprintf(f,
3956 "%sTTYPath: %s\n"
3957 "%sTTYReset: %s\n"
3958 "%sTTYVHangup: %s\n"
3959 "%sTTYVTDisallocate: %s\n",
3960 prefix, c->tty_path,
3961 prefix, yes_no(c->tty_reset),
3962 prefix, yes_no(c->tty_vhangup),
3963 prefix, yes_no(c->tty_vt_disallocate));
3964
3965 if (IN_SET(c->std_output,
3966 EXEC_OUTPUT_SYSLOG,
3967 EXEC_OUTPUT_KMSG,
3968 EXEC_OUTPUT_JOURNAL,
3969 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
3970 EXEC_OUTPUT_KMSG_AND_CONSOLE,
3971 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
3972 IN_SET(c->std_error,
3973 EXEC_OUTPUT_SYSLOG,
3974 EXEC_OUTPUT_KMSG,
3975 EXEC_OUTPUT_JOURNAL,
3976 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
3977 EXEC_OUTPUT_KMSG_AND_CONSOLE,
3978 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
3979
3980 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
3981
3982 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
3983 if (r >= 0)
3984 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
3985
3986 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
3987 if (r >= 0)
3988 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
3989 }
3990
3991 if (c->log_level_max >= 0) {
3992 _cleanup_free_ char *t = NULL;
3993
3994 (void) log_level_to_string_alloc(c->log_level_max, &t);
3995
3996 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
3997 }
3998
3999 if (c->n_log_extra_fields > 0) {
4000 size_t j;
4001
4002 for (j = 0; j < c->n_log_extra_fields; j++) {
4003 fprintf(f, "%sLogExtraFields: ", prefix);
4004 fwrite(c->log_extra_fields[j].iov_base,
4005 1, c->log_extra_fields[j].iov_len,
4006 f);
4007 fputc('\n', f);
4008 }
4009 }
4010
4011 if (c->secure_bits) {
4012 _cleanup_free_ char *str = NULL;
4013
4014 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4015 if (r >= 0)
4016 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4017 }
4018
4019 if (c->capability_bounding_set != CAP_ALL) {
4020 _cleanup_free_ char *str = NULL;
4021
4022 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4023 if (r >= 0)
4024 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4025 }
4026
4027 if (c->capability_ambient_set != 0) {
4028 _cleanup_free_ char *str = NULL;
4029
4030 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4031 if (r >= 0)
4032 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4033 }
4034
4035 if (c->user)
4036 fprintf(f, "%sUser: %s\n", prefix, c->user);
4037 if (c->group)
4038 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4039
4040 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4041
4042 if (!strv_isempty(c->supplementary_groups)) {
4043 fprintf(f, "%sSupplementaryGroups:", prefix);
4044 strv_fprintf(f, c->supplementary_groups);
4045 fputs("\n", f);
4046 }
4047
4048 if (c->pam_name)
4049 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4050
4051 if (strv_length(c->read_write_paths) > 0) {
4052 fprintf(f, "%sReadWritePaths:", prefix);
4053 strv_fprintf(f, c->read_write_paths);
4054 fputs("\n", f);
4055 }
4056
4057 if (strv_length(c->read_only_paths) > 0) {
4058 fprintf(f, "%sReadOnlyPaths:", prefix);
4059 strv_fprintf(f, c->read_only_paths);
4060 fputs("\n", f);
4061 }
4062
4063 if (strv_length(c->inaccessible_paths) > 0) {
4064 fprintf(f, "%sInaccessiblePaths:", prefix);
4065 strv_fprintf(f, c->inaccessible_paths);
4066 fputs("\n", f);
4067 }
4068
4069 if (c->n_bind_mounts > 0)
4070 for (i = 0; i < c->n_bind_mounts; i++) {
4071 fprintf(f, "%s%s: %s:%s:%s\n", prefix,
4072 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4073 c->bind_mounts[i].source,
4074 c->bind_mounts[i].destination,
4075 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4076 }
4077
4078 if (c->utmp_id)
4079 fprintf(f,
4080 "%sUtmpIdentifier: %s\n",
4081 prefix, c->utmp_id);
4082
4083 if (c->selinux_context)
4084 fprintf(f,
4085 "%sSELinuxContext: %s%s\n",
4086 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4087
4088 if (c->apparmor_profile)
4089 fprintf(f,
4090 "%sAppArmorProfile: %s%s\n",
4091 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4092
4093 if (c->smack_process_label)
4094 fprintf(f,
4095 "%sSmackProcessLabel: %s%s\n",
4096 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4097
4098 if (c->personality != PERSONALITY_INVALID)
4099 fprintf(f,
4100 "%sPersonality: %s\n",
4101 prefix, strna(personality_to_string(c->personality)));
4102
4103 fprintf(f,
4104 "%sLockPersonality: %s\n",
4105 prefix, yes_no(c->lock_personality));
4106
4107 if (c->syscall_filter) {
4108 #if HAVE_SECCOMP
4109 Iterator j;
4110 void *id, *val;
4111 bool first = true;
4112 #endif
4113
4114 fprintf(f,
4115 "%sSystemCallFilter: ",
4116 prefix);
4117
4118 if (!c->syscall_whitelist)
4119 fputc('~', f);
4120
4121 #if HAVE_SECCOMP
4122 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4123 _cleanup_free_ char *name = NULL;
4124 const char *errno_name = NULL;
4125 int num = PTR_TO_INT(val);
4126
4127 if (first)
4128 first = false;
4129 else
4130 fputc(' ', f);
4131
4132 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4133 fputs(strna(name), f);
4134
4135 if (num >= 0) {
4136 errno_name = errno_to_name(num);
4137 if (errno_name)
4138 fprintf(f, ":%s", errno_name);
4139 else
4140 fprintf(f, ":%d", num);
4141 }
4142 }
4143 #endif
4144
4145 fputc('\n', f);
4146 }
4147
4148 if (c->syscall_archs) {
4149 #if HAVE_SECCOMP
4150 Iterator j;
4151 void *id;
4152 #endif
4153
4154 fprintf(f,
4155 "%sSystemCallArchitectures:",
4156 prefix);
4157
4158 #if HAVE_SECCOMP
4159 SET_FOREACH(id, c->syscall_archs, j)
4160 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4161 #endif
4162 fputc('\n', f);
4163 }
4164
4165 if (exec_context_restrict_namespaces_set(c)) {
4166 _cleanup_free_ char *s = NULL;
4167
4168 r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
4169 if (r >= 0)
4170 fprintf(f, "%sRestrictNamespaces: %s\n",
4171 prefix, s);
4172 }
4173
4174 if (c->syscall_errno > 0) {
4175 const char *errno_name;
4176
4177 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4178
4179 errno_name = errno_to_name(c->syscall_errno);
4180 if (errno_name)
4181 fprintf(f, "%s\n", errno_name);
4182 else
4183 fprintf(f, "%d\n", c->syscall_errno);
4184 }
4185
4186 if (c->apparmor_profile)
4187 fprintf(f,
4188 "%sAppArmorProfile: %s%s\n",
4189 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4190 }
4191
4192 bool exec_context_maintains_privileges(ExecContext *c) {
4193 assert(c);
4194
4195 /* Returns true if the process forked off would run under
4196 * an unchanged UID or as root. */
4197
4198 if (!c->user)
4199 return true;
4200
4201 if (streq(c->user, "root") || streq(c->user, "0"))
4202 return true;
4203
4204 return false;
4205 }
4206
4207 int exec_context_get_effective_ioprio(ExecContext *c) {
4208 int p;
4209
4210 assert(c);
4211
4212 if (c->ioprio_set)
4213 return c->ioprio;
4214
4215 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4216 if (p < 0)
4217 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4218
4219 return p;
4220 }
4221
4222 void exec_context_free_log_extra_fields(ExecContext *c) {
4223 size_t l;
4224
4225 assert(c);
4226
4227 for (l = 0; l < c->n_log_extra_fields; l++)
4228 free(c->log_extra_fields[l].iov_base);
4229 c->log_extra_fields = mfree(c->log_extra_fields);
4230 c->n_log_extra_fields = 0;
4231 }
4232
4233 void exec_status_start(ExecStatus *s, pid_t pid) {
4234 assert(s);
4235
4236 zero(*s);
4237 s->pid = pid;
4238 dual_timestamp_get(&s->start_timestamp);
4239 }
4240
4241 void exec_status_exit(ExecStatus *s, ExecContext *context, pid_t pid, int code, int status) {
4242 assert(s);
4243
4244 if (s->pid && s->pid != pid)
4245 zero(*s);
4246
4247 s->pid = pid;
4248 dual_timestamp_get(&s->exit_timestamp);
4249
4250 s->code = code;
4251 s->status = status;
4252
4253 if (context) {
4254 if (context->utmp_id)
4255 utmp_put_dead_process(context->utmp_id, pid, code, status);
4256
4257 exec_context_tty_reset(context, NULL);
4258 }
4259 }
4260
4261 void exec_status_dump(ExecStatus *s, FILE *f, const char *prefix) {
4262 char buf[FORMAT_TIMESTAMP_MAX];
4263
4264 assert(s);
4265 assert(f);
4266
4267 if (s->pid <= 0)
4268 return;
4269
4270 prefix = strempty(prefix);
4271
4272 fprintf(f,
4273 "%sPID: "PID_FMT"\n",
4274 prefix, s->pid);
4275
4276 if (dual_timestamp_is_set(&s->start_timestamp))
4277 fprintf(f,
4278 "%sStart Timestamp: %s\n",
4279 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4280
4281 if (dual_timestamp_is_set(&s->exit_timestamp))
4282 fprintf(f,
4283 "%sExit Timestamp: %s\n"
4284 "%sExit Code: %s\n"
4285 "%sExit Status: %i\n",
4286 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4287 prefix, sigchld_code_to_string(s->code),
4288 prefix, s->status);
4289 }
4290
4291 char *exec_command_line(char **argv) {
4292 size_t k;
4293 char *n, *p, **a;
4294 bool first = true;
4295
4296 assert(argv);
4297
4298 k = 1;
4299 STRV_FOREACH(a, argv)
4300 k += strlen(*a)+3;
4301
4302 n = new(char, k);
4303 if (!n)
4304 return NULL;
4305
4306 p = n;
4307 STRV_FOREACH(a, argv) {
4308
4309 if (!first)
4310 *(p++) = ' ';
4311 else
4312 first = false;
4313
4314 if (strpbrk(*a, WHITESPACE)) {
4315 *(p++) = '\'';
4316 p = stpcpy(p, *a);
4317 *(p++) = '\'';
4318 } else
4319 p = stpcpy(p, *a);
4320
4321 }
4322
4323 *p = 0;
4324
4325 /* FIXME: this doesn't really handle arguments that have
4326 * spaces and ticks in them */
4327
4328 return n;
4329 }
4330
4331 void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4332 _cleanup_free_ char *cmd = NULL;
4333 const char *prefix2;
4334
4335 assert(c);
4336 assert(f);
4337
4338 prefix = strempty(prefix);
4339 prefix2 = strjoina(prefix, "\t");
4340
4341 cmd = exec_command_line(c->argv);
4342 fprintf(f,
4343 "%sCommand Line: %s\n",
4344 prefix, cmd ? cmd : strerror(ENOMEM));
4345
4346 exec_status_dump(&c->exec_status, f, prefix2);
4347 }
4348
4349 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4350 assert(f);
4351
4352 prefix = strempty(prefix);
4353
4354 LIST_FOREACH(command, c, c)
4355 exec_command_dump(c, f, prefix);
4356 }
4357
4358 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4359 ExecCommand *end;
4360
4361 assert(l);
4362 assert(e);
4363
4364 if (*l) {
4365 /* It's kind of important, that we keep the order here */
4366 LIST_FIND_TAIL(command, *l, end);
4367 LIST_INSERT_AFTER(command, *l, end, e);
4368 } else
4369 *l = e;
4370 }
4371
4372 int exec_command_set(ExecCommand *c, const char *path, ...) {
4373 va_list ap;
4374 char **l, *p;
4375
4376 assert(c);
4377 assert(path);
4378
4379 va_start(ap, path);
4380 l = strv_new_ap(path, ap);
4381 va_end(ap);
4382
4383 if (!l)
4384 return -ENOMEM;
4385
4386 p = strdup(path);
4387 if (!p) {
4388 strv_free(l);
4389 return -ENOMEM;
4390 }
4391
4392 free(c->path);
4393 c->path = p;
4394
4395 strv_free(c->argv);
4396 c->argv = l;
4397
4398 return 0;
4399 }
4400
4401 int exec_command_append(ExecCommand *c, const char *path, ...) {
4402 _cleanup_strv_free_ char **l = NULL;
4403 va_list ap;
4404 int r;
4405
4406 assert(c);
4407 assert(path);
4408
4409 va_start(ap, path);
4410 l = strv_new_ap(path, ap);
4411 va_end(ap);
4412
4413 if (!l)
4414 return -ENOMEM;
4415
4416 r = strv_extend_strv(&c->argv, l, false);
4417 if (r < 0)
4418 return r;
4419
4420 return 0;
4421 }
4422
4423
4424 static int exec_runtime_allocate(ExecRuntime **rt) {
4425
4426 if (*rt)
4427 return 0;
4428
4429 *rt = new0(ExecRuntime, 1);
4430 if (!*rt)
4431 return -ENOMEM;
4432
4433 (*rt)->n_ref = 1;
4434 (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4435
4436 return 0;
4437 }
4438
4439 int exec_runtime_make(ExecRuntime **rt, ExecContext *c, const char *id) {
4440 int r;
4441
4442 assert(rt);
4443 assert(c);
4444 assert(id);
4445
4446 if (*rt)
4447 return 1;
4448
4449 if (!c->private_network && !c->private_tmp)
4450 return 0;
4451
4452 r = exec_runtime_allocate(rt);
4453 if (r < 0)
4454 return r;
4455
4456 if (c->private_network && (*rt)->netns_storage_socket[0] < 0) {
4457 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, (*rt)->netns_storage_socket) < 0)
4458 return -errno;
4459 }
4460
4461 if (c->private_tmp && !(*rt)->tmp_dir) {
4462 r = setup_tmp_dirs(id, &(*rt)->tmp_dir, &(*rt)->var_tmp_dir);
4463 if (r < 0)
4464 return r;
4465 }
4466
4467 return 1;
4468 }
4469
4470 ExecRuntime *exec_runtime_ref(ExecRuntime *r) {
4471 assert(r);
4472 assert(r->n_ref > 0);
4473
4474 r->n_ref++;
4475 return r;
4476 }
4477
4478 ExecRuntime *exec_runtime_unref(ExecRuntime *r) {
4479
4480 if (!r)
4481 return NULL;
4482
4483 assert(r->n_ref > 0);
4484
4485 r->n_ref--;
4486 if (r->n_ref > 0)
4487 return NULL;
4488
4489 free(r->tmp_dir);
4490 free(r->var_tmp_dir);
4491 safe_close_pair(r->netns_storage_socket);
4492 return mfree(r);
4493 }
4494
4495 int exec_runtime_serialize(Unit *u, ExecRuntime *rt, FILE *f, FDSet *fds) {
4496 assert(u);
4497 assert(f);
4498 assert(fds);
4499
4500 if (!rt)
4501 return 0;
4502
4503 if (rt->tmp_dir)
4504 unit_serialize_item(u, f, "tmp-dir", rt->tmp_dir);
4505
4506 if (rt->var_tmp_dir)
4507 unit_serialize_item(u, f, "var-tmp-dir", rt->var_tmp_dir);
4508
4509 if (rt->netns_storage_socket[0] >= 0) {
4510 int copy;
4511
4512 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4513 if (copy < 0)
4514 return copy;
4515
4516 unit_serialize_item_format(u, f, "netns-socket-0", "%i", copy);
4517 }
4518
4519 if (rt->netns_storage_socket[1] >= 0) {
4520 int copy;
4521
4522 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4523 if (copy < 0)
4524 return copy;
4525
4526 unit_serialize_item_format(u, f, "netns-socket-1", "%i", copy);
4527 }
4528
4529 return 0;
4530 }
4531
4532 int exec_runtime_deserialize_item(Unit *u, ExecRuntime **rt, const char *key, const char *value, FDSet *fds) {
4533 int r;
4534
4535 assert(rt);
4536 assert(key);
4537 assert(value);
4538
4539 if (streq(key, "tmp-dir")) {
4540 char *copy;
4541
4542 r = exec_runtime_allocate(rt);
4543 if (r < 0)
4544 return log_oom();
4545
4546 copy = strdup(value);
4547 if (!copy)
4548 return log_oom();
4549
4550 free((*rt)->tmp_dir);
4551 (*rt)->tmp_dir = copy;
4552
4553 } else if (streq(key, "var-tmp-dir")) {
4554 char *copy;
4555
4556 r = exec_runtime_allocate(rt);
4557 if (r < 0)
4558 return log_oom();
4559
4560 copy = strdup(value);
4561 if (!copy)
4562 return log_oom();
4563
4564 free((*rt)->var_tmp_dir);
4565 (*rt)->var_tmp_dir = copy;
4566
4567 } else if (streq(key, "netns-socket-0")) {
4568 int fd;
4569
4570 r = exec_runtime_allocate(rt);
4571 if (r < 0)
4572 return log_oom();
4573
4574 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4575 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4576 else {
4577 safe_close((*rt)->netns_storage_socket[0]);
4578 (*rt)->netns_storage_socket[0] = fdset_remove(fds, fd);
4579 }
4580 } else if (streq(key, "netns-socket-1")) {
4581 int fd;
4582
4583 r = exec_runtime_allocate(rt);
4584 if (r < 0)
4585 return log_oom();
4586
4587 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4588 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4589 else {
4590 safe_close((*rt)->netns_storage_socket[1]);
4591 (*rt)->netns_storage_socket[1] = fdset_remove(fds, fd);
4592 }
4593 } else
4594 return 0;
4595
4596 return 1;
4597 }
4598
4599 static void *remove_tmpdir_thread(void *p) {
4600 _cleanup_free_ char *path = p;
4601
4602 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4603 return NULL;
4604 }
4605
4606 void exec_runtime_destroy(ExecRuntime *rt) {
4607 int r;
4608
4609 if (!rt)
4610 return;
4611
4612 /* If there are multiple users of this, let's leave the stuff around */
4613 if (rt->n_ref > 1)
4614 return;
4615
4616 if (rt->tmp_dir) {
4617 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4618
4619 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4620 if (r < 0) {
4621 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4622 free(rt->tmp_dir);
4623 }
4624
4625 rt->tmp_dir = NULL;
4626 }
4627
4628 if (rt->var_tmp_dir) {
4629 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4630
4631 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4632 if (r < 0) {
4633 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4634 free(rt->var_tmp_dir);
4635 }
4636
4637 rt->var_tmp_dir = NULL;
4638 }
4639
4640 safe_close_pair(rt->netns_storage_socket);
4641 }
4642
4643 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
4644 [EXEC_INPUT_NULL] = "null",
4645 [EXEC_INPUT_TTY] = "tty",
4646 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4647 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
4648 [EXEC_INPUT_SOCKET] = "socket",
4649 [EXEC_INPUT_NAMED_FD] = "fd",
4650 [EXEC_INPUT_DATA] = "data",
4651 };
4652
4653 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
4654
4655 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
4656 [EXEC_OUTPUT_INHERIT] = "inherit",
4657 [EXEC_OUTPUT_NULL] = "null",
4658 [EXEC_OUTPUT_TTY] = "tty",
4659 [EXEC_OUTPUT_SYSLOG] = "syslog",
4660 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
4661 [EXEC_OUTPUT_KMSG] = "kmsg",
4662 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
4663 [EXEC_OUTPUT_JOURNAL] = "journal",
4664 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
4665 [EXEC_OUTPUT_SOCKET] = "socket",
4666 [EXEC_OUTPUT_NAMED_FD] = "fd",
4667 };
4668
4669 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
4670
4671 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
4672 [EXEC_UTMP_INIT] = "init",
4673 [EXEC_UTMP_LOGIN] = "login",
4674 [EXEC_UTMP_USER] = "user",
4675 };
4676
4677 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
4678
4679 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
4680 [EXEC_PRESERVE_NO] = "no",
4681 [EXEC_PRESERVE_YES] = "yes",
4682 [EXEC_PRESERVE_RESTART] = "restart",
4683 };
4684
4685 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
4686
4687 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
4688 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
4689 [EXEC_DIRECTORY_STATE] = "StateDirectory",
4690 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
4691 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
4692 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
4693 };
4694
4695 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
4696
4697 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
4698 [EXEC_KEYRING_INHERIT] = "inherit",
4699 [EXEC_KEYRING_PRIVATE] = "private",
4700 [EXEC_KEYRING_SHARED] = "shared",
4701 };
4702
4703 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);