]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
Merge pull request #14874 from bluca/portable_detach_now_block
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <poll.h>
6 #include <sys/eventfd.h>
7 #include <sys/ioctl.h>
8 #include <sys/mman.h>
9 #include <sys/personality.h>
10 #include <sys/prctl.h>
11 #include <sys/shm.h>
12 #include <sys/types.h>
13 #include <sys/un.h>
14 #include <unistd.h>
15 #include <utmpx.h>
16
17 #if HAVE_PAM
18 #include <security/pam_appl.h>
19 #endif
20
21 #if HAVE_SELINUX
22 #include <selinux/selinux.h>
23 #endif
24
25 #if HAVE_SECCOMP
26 #include <seccomp.h>
27 #endif
28
29 #if HAVE_APPARMOR
30 #include <sys/apparmor.h>
31 #endif
32
33 #include "sd-messages.h"
34
35 #include "af-list.h"
36 #include "alloc-util.h"
37 #if HAVE_APPARMOR
38 #include "apparmor-util.h"
39 #endif
40 #include "async.h"
41 #include "barrier.h"
42 #include "cap-list.h"
43 #include "capability-util.h"
44 #include "chown-recursive.h"
45 #include "cgroup-setup.h"
46 #include "cpu-set-util.h"
47 #include "def.h"
48 #include "env-file.h"
49 #include "env-util.h"
50 #include "errno-list.h"
51 #include "execute.h"
52 #include "exit-status.h"
53 #include "fd-util.h"
54 #include "format-util.h"
55 #include "fs-util.h"
56 #include "glob-util.h"
57 #include "io-util.h"
58 #include "ioprio.h"
59 #include "label.h"
60 #include "log.h"
61 #include "macro.h"
62 #include "manager.h"
63 #include "memory-util.h"
64 #include "missing_fs.h"
65 #include "mkdir.h"
66 #include "namespace.h"
67 #include "parse-util.h"
68 #include "path-util.h"
69 #include "process-util.h"
70 #include "rlimit-util.h"
71 #include "rm-rf.h"
72 #if HAVE_SECCOMP
73 #include "seccomp-util.h"
74 #endif
75 #include "securebits-util.h"
76 #include "selinux-util.h"
77 #include "signal-util.h"
78 #include "smack-util.h"
79 #include "socket-util.h"
80 #include "special.h"
81 #include "stat-util.h"
82 #include "string-table.h"
83 #include "string-util.h"
84 #include "strv.h"
85 #include "syslog-util.h"
86 #include "terminal-util.h"
87 #include "umask-util.h"
88 #include "unit.h"
89 #include "user-util.h"
90 #include "utmp-wtmp.h"
91
92 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
93 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
94
95 #define SNDBUF_SIZE (8*1024*1024)
96
/* Moves the n_fds file descriptors in fds[] so that entry k ends up as fd number k+3.
 *
 * The array is modified in place: each entry that is not yet at its target number is duplicated
 * with F_DUPFD to the lowest free fd >= the target, and the old fd is closed. If the target itself
 * was still occupied, the pass is repeated starting from the first such entry until every entry sits
 * at its target.
 *
 * Returns 0 on success, or a negative errno if fcntl() fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first_pending = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        while (first_pending >= 0) {
                int idx, begin = first_pending;

                /* Nothing left over (yet) in this pass */
                first_pending = -1;

                for (idx = begin; idx < (int) n_fds; idx++) {
                        int target = idx + 3, dup_fd;

                        /* Nothing to do if the fd already has the number we want */
                        if (fds[idx] == target)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, target);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* The number we wanted was taken, so we landed somewhere above it. Remember the
                         * first entry this happened to, so that the next pass resumes there. */
                        if (dup_fd != target && first_pending < 0)
                                first_pending = idx;
                }
        }

        return 0;
}
141
/* Adjusts the file flags of the fds that are about to be passed to a child.
 *
 * fds[] holds n_socket_fds socket fds followed by n_storage_fds further fds. O_NONBLOCK is set or
 * cleared (according to 'nonblock') on the socket fds only; FD_CLOEXEC is cleared on every fd.
 *
 * Returns 0 on success, or the negative error returned by fd_nonblock()/fd_cloexec(). */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;
        size_t k;

        if (total == 0)
                return 0;

        assert(fds);

        for (k = 0; k < total; k++) {
                int q;

                /* O_NONBLOCK only matters for the socket activation fds, which come first */
                if (k < n_socket_fds) {
                        q = fd_nonblock(fds[k], nonblock);
                        if (q < 0)
                                return q;
                }

                /* FD_CLOEXEC is dropped unconditionally: these fds are meant to survive exec() */
                q = fd_cloexec(fds[k], false);
                if (q < 0)
                        return q;
        }

        return 0;
}
174
175 static const char *exec_context_tty_path(const ExecContext *context) {
176 assert(context);
177
178 if (context->stdio_as_fds)
179 return NULL;
180
181 if (context->tty_path)
182 return context->tty_path;
183
184 return "/dev/console";
185 }
186
187 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
188 const char *path;
189
190 assert(context);
191
192 path = exec_context_tty_path(context);
193
194 if (context->tty_vhangup) {
195 if (p && p->stdin_fd >= 0)
196 (void) terminal_vhangup_fd(p->stdin_fd);
197 else if (path)
198 (void) terminal_vhangup(path);
199 }
200
201 if (context->tty_reset) {
202 if (p && p->stdin_fd >= 0)
203 (void) reset_terminal_fd(p->stdin_fd, true);
204 else if (path)
205 (void) reset_terminal(path);
206 }
207
208 if (context->tty_vt_disallocate && path)
209 (void) vt_disallocate(path);
210 }
211
212 static bool is_terminal_input(ExecInput i) {
213 return IN_SET(i,
214 EXEC_INPUT_TTY,
215 EXEC_INPUT_TTY_FORCE,
216 EXEC_INPUT_TTY_FAIL);
217 }
218
219 static bool is_terminal_output(ExecOutput o) {
220 return IN_SET(o,
221 EXEC_OUTPUT_TTY,
222 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
223 EXEC_OUTPUT_KMSG_AND_CONSOLE,
224 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
225 }
226
227 static bool is_syslog_output(ExecOutput o) {
228 return IN_SET(o,
229 EXEC_OUTPUT_SYSLOG,
230 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
231 }
232
233 static bool is_kmsg_output(ExecOutput o) {
234 return IN_SET(o,
235 EXEC_OUTPUT_KMSG,
236 EXEC_OUTPUT_KMSG_AND_CONSOLE);
237 }
238
239 static bool exec_context_needs_term(const ExecContext *c) {
240 assert(c);
241
242 /* Return true if the execution context suggests we should set $TERM to something useful. */
243
244 if (is_terminal_input(c->std_input))
245 return true;
246
247 if (is_terminal_output(c->std_output))
248 return true;
249
250 if (is_terminal_output(c->std_error))
251 return true;
252
253 return !!c->tty_path;
254 }
255
256 static int open_null_as(int flags, int nfd) {
257 int fd;
258
259 assert(nfd >= 0);
260
261 fd = open("/dev/null", flags|O_NOCTTY);
262 if (fd < 0)
263 return -errno;
264
265 return move_fd(fd, nfd, false);
266 }
267
268 static int connect_journal_socket(
269 int fd,
270 const char *log_namespace,
271 uid_t uid,
272 gid_t gid) {
273
274 union sockaddr_union sa;
275 socklen_t sa_len;
276 uid_t olduid = UID_INVALID;
277 gid_t oldgid = GID_INVALID;
278 const char *j;
279 int r;
280
281 j = log_namespace ?
282 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
283 "/run/systemd/journal/stdout";
284 r = sockaddr_un_set_path(&sa.un, j);
285 if (r < 0)
286 return r;
287 sa_len = r;
288
289 if (gid_is_valid(gid)) {
290 oldgid = getgid();
291
292 if (setegid(gid) < 0)
293 return -errno;
294 }
295
296 if (uid_is_valid(uid)) {
297 olduid = getuid();
298
299 if (seteuid(uid) < 0) {
300 r = -errno;
301 goto restore_gid;
302 }
303 }
304
305 r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
306
307 /* If we fail to restore the uid or gid, things will likely
308 fail later on. This should only happen if an LSM interferes. */
309
310 if (uid_is_valid(uid))
311 (void) seteuid(olduid);
312
313 restore_gid:
314 if (gid_is_valid(gid))
315 (void) setegid(oldgid);
316
317 return r;
318 }
319
320 static int connect_logger_as(
321 const Unit *unit,
322 const ExecContext *context,
323 const ExecParameters *params,
324 ExecOutput output,
325 const char *ident,
326 int nfd,
327 uid_t uid,
328 gid_t gid) {
329
330 _cleanup_close_ int fd = -1;
331 int r;
332
333 assert(context);
334 assert(params);
335 assert(output < _EXEC_OUTPUT_MAX);
336 assert(ident);
337 assert(nfd >= 0);
338
339 fd = socket(AF_UNIX, SOCK_STREAM, 0);
340 if (fd < 0)
341 return -errno;
342
343 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
344 if (r < 0)
345 return r;
346
347 if (shutdown(fd, SHUT_RD) < 0)
348 return -errno;
349
350 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
351
352 if (dprintf(fd,
353 "%s\n"
354 "%s\n"
355 "%i\n"
356 "%i\n"
357 "%i\n"
358 "%i\n"
359 "%i\n",
360 context->syslog_identifier ?: ident,
361 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
362 context->syslog_priority,
363 !!context->syslog_level_prefix,
364 is_syslog_output(output),
365 is_kmsg_output(output),
366 is_terminal_output(output)) < 0)
367 return -errno;
368
369 return move_fd(TAKE_FD(fd), nfd, false);
370 }
371
372 static int open_terminal_as(const char *path, int flags, int nfd) {
373 int fd;
374
375 assert(path);
376 assert(nfd >= 0);
377
378 fd = open_terminal(path, flags | O_NOCTTY);
379 if (fd < 0)
380 return fd;
381
382 return move_fd(fd, nfd, false);
383 }
384
385 static int acquire_path(const char *path, int flags, mode_t mode) {
386 union sockaddr_union sa;
387 socklen_t sa_len;
388 _cleanup_close_ int fd = -1;
389 int r;
390
391 assert(path);
392
393 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
394 flags |= O_CREAT;
395
396 fd = open(path, flags|O_NOCTTY, mode);
397 if (fd >= 0)
398 return TAKE_FD(fd);
399
400 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
401 return -errno;
402
403 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
404
405 r = sockaddr_un_set_path(&sa.un, path);
406 if (r < 0)
407 return r == -EINVAL ? -ENXIO : r;
408 sa_len = r;
409
410 fd = socket(AF_UNIX, SOCK_STREAM, 0);
411 if (fd < 0)
412 return -errno;
413
414 if (connect(fd, &sa.sa, sa_len) < 0)
415 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
416 * indication that his wasn't an AF_UNIX socket after all */
417
418 if ((flags & O_ACCMODE) == O_RDONLY)
419 r = shutdown(fd, SHUT_WR);
420 else if ((flags & O_ACCMODE) == O_WRONLY)
421 r = shutdown(fd, SHUT_RD);
422 else
423 r = 0;
424 if (r < 0)
425 return -errno;
426
427 return TAKE_FD(fd);
428 }
429
430 static int fixup_input(
431 const ExecContext *context,
432 int socket_fd,
433 bool apply_tty_stdin) {
434
435 ExecInput std_input;
436
437 assert(context);
438
439 std_input = context->std_input;
440
441 if (is_terminal_input(std_input) && !apply_tty_stdin)
442 return EXEC_INPUT_NULL;
443
444 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
445 return EXEC_INPUT_NULL;
446
447 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
448 return EXEC_INPUT_NULL;
449
450 return std_input;
451 }
452
453 static int fixup_output(ExecOutput std_output, int socket_fd) {
454
455 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
456 return EXEC_OUTPUT_INHERIT;
457
458 return std_output;
459 }
460
461 static int setup_input(
462 const ExecContext *context,
463 const ExecParameters *params,
464 int socket_fd,
465 const int named_iofds[static 3]) {
466
467 ExecInput i;
468
469 assert(context);
470 assert(params);
471 assert(named_iofds);
472
473 if (params->stdin_fd >= 0) {
474 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
475 return -errno;
476
477 /* Try to make this the controlling tty, if it is a tty, and reset it */
478 if (isatty(STDIN_FILENO)) {
479 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
480 (void) reset_terminal_fd(STDIN_FILENO, true);
481 }
482
483 return STDIN_FILENO;
484 }
485
486 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
487
488 switch (i) {
489
490 case EXEC_INPUT_NULL:
491 return open_null_as(O_RDONLY, STDIN_FILENO);
492
493 case EXEC_INPUT_TTY:
494 case EXEC_INPUT_TTY_FORCE:
495 case EXEC_INPUT_TTY_FAIL: {
496 int fd;
497
498 fd = acquire_terminal(exec_context_tty_path(context),
499 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
500 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
501 ACQUIRE_TERMINAL_WAIT,
502 USEC_INFINITY);
503 if (fd < 0)
504 return fd;
505
506 return move_fd(fd, STDIN_FILENO, false);
507 }
508
509 case EXEC_INPUT_SOCKET:
510 assert(socket_fd >= 0);
511
512 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
513
514 case EXEC_INPUT_NAMED_FD:
515 assert(named_iofds[STDIN_FILENO] >= 0);
516
517 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
518 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
519
520 case EXEC_INPUT_DATA: {
521 int fd;
522
523 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
524 if (fd < 0)
525 return fd;
526
527 return move_fd(fd, STDIN_FILENO, false);
528 }
529
530 case EXEC_INPUT_FILE: {
531 bool rw;
532 int fd;
533
534 assert(context->stdio_file[STDIN_FILENO]);
535
536 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
537 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
538
539 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
540 if (fd < 0)
541 return fd;
542
543 return move_fd(fd, STDIN_FILENO, false);
544 }
545
546 default:
547 assert_not_reached("Unknown input type");
548 }
549 }
550
551 static bool can_inherit_stderr_from_stdout(
552 const ExecContext *context,
553 ExecOutput o,
554 ExecOutput e) {
555
556 assert(context);
557
558 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
559 * stderr fd */
560
561 if (e == EXEC_OUTPUT_INHERIT)
562 return true;
563 if (e != o)
564 return false;
565
566 if (e == EXEC_OUTPUT_NAMED_FD)
567 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
568
569 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
570 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
571
572 return true;
573 }
574
575 static int setup_output(
576 const Unit *unit,
577 const ExecContext *context,
578 const ExecParameters *params,
579 int fileno,
580 int socket_fd,
581 const int named_iofds[static 3],
582 const char *ident,
583 uid_t uid,
584 gid_t gid,
585 dev_t *journal_stream_dev,
586 ino_t *journal_stream_ino) {
587
588 ExecOutput o;
589 ExecInput i;
590 int r;
591
592 assert(unit);
593 assert(context);
594 assert(params);
595 assert(ident);
596 assert(journal_stream_dev);
597 assert(journal_stream_ino);
598
599 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
600
601 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
602 return -errno;
603
604 return STDOUT_FILENO;
605 }
606
607 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
608 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
609 return -errno;
610
611 return STDERR_FILENO;
612 }
613
614 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
615 o = fixup_output(context->std_output, socket_fd);
616
617 if (fileno == STDERR_FILENO) {
618 ExecOutput e;
619 e = fixup_output(context->std_error, socket_fd);
620
621 /* This expects the input and output are already set up */
622
623 /* Don't change the stderr file descriptor if we inherit all
624 * the way and are not on a tty */
625 if (e == EXEC_OUTPUT_INHERIT &&
626 o == EXEC_OUTPUT_INHERIT &&
627 i == EXEC_INPUT_NULL &&
628 !is_terminal_input(context->std_input) &&
629 getppid () != 1)
630 return fileno;
631
632 /* Duplicate from stdout if possible */
633 if (can_inherit_stderr_from_stdout(context, o, e))
634 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
635
636 o = e;
637
638 } else if (o == EXEC_OUTPUT_INHERIT) {
639 /* If input got downgraded, inherit the original value */
640 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
641 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
642
643 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
644 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
645 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
646
647 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
648 if (getppid() != 1)
649 return fileno;
650
651 /* We need to open /dev/null here anew, to get the right access mode. */
652 return open_null_as(O_WRONLY, fileno);
653 }
654
655 switch (o) {
656
657 case EXEC_OUTPUT_NULL:
658 return open_null_as(O_WRONLY, fileno);
659
660 case EXEC_OUTPUT_TTY:
661 if (is_terminal_input(i))
662 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
663
664 /* We don't reset the terminal if this is just about output */
665 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
666
667 case EXEC_OUTPUT_SYSLOG:
668 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
669 case EXEC_OUTPUT_KMSG:
670 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
671 case EXEC_OUTPUT_JOURNAL:
672 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
673 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
674 if (r < 0) {
675 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
676 r = open_null_as(O_WRONLY, fileno);
677 } else {
678 struct stat st;
679
680 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
681 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
682 * services to detect whether they are connected to the journal or not.
683 *
684 * If both stdout and stderr are connected to a stream then let's make sure to store the data
685 * about STDERR as that's usually the best way to do logging. */
686
687 if (fstat(fileno, &st) >= 0 &&
688 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
689 *journal_stream_dev = st.st_dev;
690 *journal_stream_ino = st.st_ino;
691 }
692 }
693 return r;
694
695 case EXEC_OUTPUT_SOCKET:
696 assert(socket_fd >= 0);
697
698 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
699
700 case EXEC_OUTPUT_NAMED_FD:
701 assert(named_iofds[fileno] >= 0);
702
703 (void) fd_nonblock(named_iofds[fileno], false);
704 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
705
706 case EXEC_OUTPUT_FILE:
707 case EXEC_OUTPUT_FILE_APPEND: {
708 bool rw;
709 int fd, flags;
710
711 assert(context->stdio_file[fileno]);
712
713 rw = context->std_input == EXEC_INPUT_FILE &&
714 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
715
716 if (rw)
717 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
718
719 flags = O_WRONLY;
720 if (o == EXEC_OUTPUT_FILE_APPEND)
721 flags |= O_APPEND;
722
723 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
724 if (fd < 0)
725 return fd;
726
727 return move_fd(fd, fileno, 0);
728 }
729
730 default:
731 assert_not_reached("Unknown error type");
732 }
733 }
734
735 static int chown_terminal(int fd, uid_t uid) {
736 int r;
737
738 assert(fd >= 0);
739
740 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
741 if (isatty(fd) < 1) {
742 if (IN_SET(errno, EINVAL, ENOTTY))
743 return 0; /* not a tty */
744
745 return -errno;
746 }
747
748 /* This might fail. What matters are the results. */
749 r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
750 if (r < 0)
751 return r;
752
753 return 1;
754 }
755
756 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
757 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
758 int r;
759
760 assert(_saved_stdin);
761 assert(_saved_stdout);
762
763 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
764 if (saved_stdin < 0)
765 return -errno;
766
767 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
768 if (saved_stdout < 0)
769 return -errno;
770
771 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
772 if (fd < 0)
773 return fd;
774
775 r = chown_terminal(fd, getuid());
776 if (r < 0)
777 return r;
778
779 r = reset_terminal_fd(fd, true);
780 if (r < 0)
781 return r;
782
783 r = rearrange_stdio(fd, fd, STDERR_FILENO);
784 fd = -1;
785 if (r < 0)
786 return r;
787
788 *_saved_stdin = saved_stdin;
789 *_saved_stdout = saved_stdout;
790
791 saved_stdin = saved_stdout = -1;
792
793 return 0;
794 }
795
796 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
797 assert(err < 0);
798
799 if (err == -ETIMEDOUT)
800 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
801 else {
802 errno = -err;
803 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
804 }
805 }
806
807 static void write_confirm_error(int err, const char *vc, const Unit *u) {
808 _cleanup_close_ int fd = -1;
809
810 assert(vc);
811
812 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
813 if (fd < 0)
814 return;
815
816 write_confirm_error_fd(err, fd, u);
817 }
818
/* Releases the terminal and puts back the stdin/stdout fds saved in *saved_stdin and *saved_stdout
 * (each only if it is >= 0). Both saved fds are closed and the variables updated with the result of
 * safe_close() in any case.
 *
 * Returns 0 on success, or the negative errno of the last dup2() call that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
840
/* The possible results of ask_for_confirmation() */
enum {
        CONFIRM_EXECUTE         =  1, /* execute the command */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command, and pretend it succeeded */
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command, and pretend it failed */
};
846
847 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
848 int saved_stdout = -1, saved_stdin = -1, r;
849 _cleanup_free_ char *e = NULL;
850 char c;
851
852 /* For any internal errors, assume a positive response. */
853 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
854 if (r < 0) {
855 write_confirm_error(r, vc, u);
856 return CONFIRM_EXECUTE;
857 }
858
859 /* confirm_spawn might have been disabled while we were sleeping. */
860 if (manager_is_confirm_spawn_disabled(u->manager)) {
861 r = 1;
862 goto restore_stdio;
863 }
864
865 e = ellipsize(cmdline, 60, 100);
866 if (!e) {
867 log_oom();
868 r = CONFIRM_EXECUTE;
869 goto restore_stdio;
870 }
871
872 for (;;) {
873 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
874 if (r < 0) {
875 write_confirm_error_fd(r, STDOUT_FILENO, u);
876 r = CONFIRM_EXECUTE;
877 goto restore_stdio;
878 }
879
880 switch (c) {
881 case 'c':
882 printf("Resuming normal execution.\n");
883 manager_disable_confirm_spawn();
884 r = 1;
885 break;
886 case 'D':
887 unit_dump(u, stdout, " ");
888 continue; /* ask again */
889 case 'f':
890 printf("Failing execution.\n");
891 r = CONFIRM_PRETEND_FAILURE;
892 break;
893 case 'h':
894 printf(" c - continue, proceed without asking anymore\n"
895 " D - dump, show the state of the unit\n"
896 " f - fail, don't execute the command and pretend it failed\n"
897 " h - help\n"
898 " i - info, show a short summary of the unit\n"
899 " j - jobs, show jobs that are in progress\n"
900 " s - skip, don't execute the command and pretend it succeeded\n"
901 " y - yes, execute the command\n");
902 continue; /* ask again */
903 case 'i':
904 printf(" Description: %s\n"
905 " Unit: %s\n"
906 " Command: %s\n",
907 u->id, u->description, cmdline);
908 continue; /* ask again */
909 case 'j':
910 manager_dump_jobs(u->manager, stdout, " ");
911 continue; /* ask again */
912 case 'n':
913 /* 'n' was removed in favor of 'f'. */
914 printf("Didn't understand 'n', did you mean 'f'?\n");
915 continue; /* ask again */
916 case 's':
917 printf("Skipping execution.\n");
918 r = CONFIRM_PRETEND_SUCCESS;
919 break;
920 case 'y':
921 r = CONFIRM_EXECUTE;
922 break;
923 default:
924 assert_not_reached("Unhandled choice");
925 }
926 break;
927 }
928
929 restore_stdio:
930 restore_confirm_stdio(&saved_stdin, &saved_stdout);
931 return r;
932 }
933
934 static int get_fixed_user(const ExecContext *c, const char **user,
935 uid_t *uid, gid_t *gid,
936 const char **home, const char **shell) {
937 int r;
938 const char *name;
939
940 assert(c);
941
942 if (!c->user)
943 return 0;
944
945 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
946 * (i.e. are "/" or "/bin/nologin"). */
947
948 name = c->user;
949 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
950 if (r < 0)
951 return r;
952
953 *user = name;
954 return 0;
955 }
956
957 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
958 int r;
959 const char *name;
960
961 assert(c);
962
963 if (!c->group)
964 return 0;
965
966 name = c->group;
967 r = get_group_creds(&name, gid, 0);
968 if (r < 0)
969 return r;
970
971 *group = name;
972 return 0;
973 }
974
975 static int get_supplementary_groups(const ExecContext *c, const char *user,
976 const char *group, gid_t gid,
977 gid_t **supplementary_gids, int *ngids) {
978 char **i;
979 int r, k = 0;
980 int ngroups_max;
981 bool keep_groups = false;
982 gid_t *groups = NULL;
983 _cleanup_free_ gid_t *l_gids = NULL;
984
985 assert(c);
986
987 /*
988 * If user is given, then lookup GID and supplementary groups list.
989 * We avoid NSS lookups for gid=0. Also we have to initialize groups
990 * here and as early as possible so we keep the list of supplementary
991 * groups of the caller.
992 */
993 if (user && gid_is_valid(gid) && gid != 0) {
994 /* First step, initialize groups from /etc/groups */
995 if (initgroups(user, gid) < 0)
996 return -errno;
997
998 keep_groups = true;
999 }
1000
1001 if (strv_isempty(c->supplementary_groups))
1002 return 0;
1003
1004 /*
1005 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1006 * be positive, otherwise fail.
1007 */
1008 errno = 0;
1009 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1010 if (ngroups_max <= 0)
1011 return errno_or_else(EOPNOTSUPP);
1012
1013 l_gids = new(gid_t, ngroups_max);
1014 if (!l_gids)
1015 return -ENOMEM;
1016
1017 if (keep_groups) {
1018 /*
1019 * Lookup the list of groups that the user belongs to, we
1020 * avoid NSS lookups here too for gid=0.
1021 */
1022 k = ngroups_max;
1023 if (getgrouplist(user, gid, l_gids, &k) < 0)
1024 return -EINVAL;
1025 } else
1026 k = 0;
1027
1028 STRV_FOREACH(i, c->supplementary_groups) {
1029 const char *g;
1030
1031 if (k >= ngroups_max)
1032 return -E2BIG;
1033
1034 g = *i;
1035 r = get_group_creds(&g, l_gids+k, 0);
1036 if (r < 0)
1037 return r;
1038
1039 k++;
1040 }
1041
1042 /*
1043 * Sets ngids to zero to drop all supplementary groups, happens
1044 * when we are under root and SupplementaryGroups= is empty.
1045 */
1046 if (k == 0) {
1047 *ngids = 0;
1048 return 0;
1049 }
1050
1051 /* Otherwise get the final list of supplementary groups */
1052 groups = memdup(l_gids, sizeof(gid_t) * k);
1053 if (!groups)
1054 return -ENOMEM;
1055
1056 *supplementary_gids = groups;
1057 *ngids = k;
1058
1059 groups = NULL;
1060
1061 return 0;
1062 }
1063
/* Applies the group credentials to the current process: first the supplementary groups (only if
 * ngids > 0), then the real, effective and saved GID (only if gid is valid).
 *
 * Returns 0 on success, or a negative errno-style error. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        int q;

        /* Only touch the supplementary groups if there are any to set */
        if (ngids > 0) {
                q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        /* Then switch all three GIDs at once */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1082
1083 static int enforce_user(const ExecContext *context, uid_t uid) {
1084 assert(context);
1085
1086 if (!uid_is_valid(uid))
1087 return 0;
1088
1089 /* Sets (but doesn't look up) the uid and make sure we keep the
1090 * capabilities while doing so. */
1091
1092 if (context->capability_ambient_set != 0) {
1093
1094 /* First step: If we need to keep capabilities but
1095 * drop privileges we need to make sure we keep our
1096 * caps, while we drop privileges. */
1097 if (uid != 0) {
1098 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1099
1100 if (prctl(PR_GET_SECUREBITS) != sb)
1101 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1102 return -errno;
1103 }
1104 }
1105
1106 /* Second step: actually set the uids */
1107 if (setresuid(uid, uid, uid) < 0)
1108 return -errno;
1109
1110 /* At this point we should have all necessary capabilities but
1111 are otherwise a normal user. However, the caps might got
1112 corrupted due to the setresuid() so we need clean them up
1113 later. This is done outside of this call. */
1114
1115 return 0;
1116 }
1117
1118 #if HAVE_PAM
1119
/* PAM conversation callback that rejects every request: none of the arguments are looked at, and
 * PAM_CONV_ERR is returned unconditionally. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        /* We don't support conversations */

        return PAM_CONV_ERR;
}
1130
1131 #endif
1132
1133 static int setup_pam(
1134 const char *name,
1135 const char *user,
1136 uid_t uid,
1137 gid_t gid,
1138 const char *tty,
1139 char ***env,
1140 const int fds[], size_t n_fds) {
1141
1142 #if HAVE_PAM
1143
1144 static const struct pam_conv conv = {
1145 .conv = null_conv,
1146 .appdata_ptr = NULL
1147 };
1148
1149 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1150 pam_handle_t *handle = NULL;
1151 sigset_t old_ss;
1152 int pam_code = PAM_SUCCESS, r;
1153 char **nv, **e = NULL;
1154 bool close_session = false;
1155 pid_t pam_pid = 0, parent_pid;
1156 int flags = 0;
1157
1158 assert(name);
1159 assert(user);
1160 assert(env);
1161
1162 /* We set up PAM in the parent process, then fork. The child
1163 * will then stay around until killed via PR_GET_PDEATHSIG or
1164 * systemd via the cgroup logic. It will then remove the PAM
1165 * session again. The parent process will exec() the actual
1166 * daemon. We do things this way to ensure that the main PID
1167 * of the daemon is the one we initially fork()ed. */
1168
1169 r = barrier_create(&barrier);
1170 if (r < 0)
1171 goto fail;
1172
1173 if (log_get_max_level() < LOG_DEBUG)
1174 flags |= PAM_SILENT;
1175
1176 pam_code = pam_start(name, user, &conv, &handle);
1177 if (pam_code != PAM_SUCCESS) {
1178 handle = NULL;
1179 goto fail;
1180 }
1181
1182 if (!tty) {
1183 _cleanup_free_ char *q = NULL;
1184
1185 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1186 * out if that's the case, and read the TTY off it. */
1187
1188 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1189 tty = strjoina("/dev/", q);
1190 }
1191
1192 if (tty) {
1193 pam_code = pam_set_item(handle, PAM_TTY, tty);
1194 if (pam_code != PAM_SUCCESS)
1195 goto fail;
1196 }
1197
1198 STRV_FOREACH(nv, *env) {
1199 pam_code = pam_putenv(handle, *nv);
1200 if (pam_code != PAM_SUCCESS)
1201 goto fail;
1202 }
1203
1204 pam_code = pam_acct_mgmt(handle, flags);
1205 if (pam_code != PAM_SUCCESS)
1206 goto fail;
1207
1208 pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
1209 if (pam_code != PAM_SUCCESS)
1210 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));
1211
1212 pam_code = pam_open_session(handle, flags);
1213 if (pam_code != PAM_SUCCESS)
1214 goto fail;
1215
1216 close_session = true;
1217
1218 e = pam_getenvlist(handle);
1219 if (!e) {
1220 pam_code = PAM_BUF_ERR;
1221 goto fail;
1222 }
1223
1224 /* Block SIGTERM, so that we know that it won't get lost in
1225 * the child */
1226
1227 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1228
1229 parent_pid = getpid_cached();
1230
1231 r = safe_fork("(sd-pam)", 0, &pam_pid);
1232 if (r < 0)
1233 goto fail;
1234 if (r == 0) {
1235 int sig, ret = EXIT_PAM;
1236
1237 /* The child's job is to reset the PAM session on
1238 * termination */
1239 barrier_set_role(&barrier, BARRIER_CHILD);
1240
1241 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
1242 * are open here that have been opened by PAM. */
1243 (void) close_many(fds, n_fds);
1244
1245 /* Drop privileges - we don't need any to pam_close_session
1246 * and this will make PR_SET_PDEATHSIG work in most cases.
1247 * If this fails, ignore the error - but expect sd-pam threads
1248 * to fail to exit normally */
1249
1250 r = maybe_setgroups(0, NULL);
1251 if (r < 0)
1252 log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
1253 if (setresgid(gid, gid, gid) < 0)
1254 log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
1255 if (setresuid(uid, uid, uid) < 0)
1256 log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
1257
1258 (void) ignore_signals(SIGPIPE, -1);
1259
1260 /* Wait until our parent died. This will only work if
1261 * the above setresuid() succeeds, otherwise the kernel
1262 * will not allow unprivileged parents kill their privileged
1263 * children this way. We rely on the control groups kill logic
1264 * to do the rest for us. */
1265 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1266 goto child_finish;
1267
1268 /* Tell the parent that our setup is done. This is especially
1269 * important regarding dropping privileges. Otherwise, unit
1270 * setup might race against our setresuid(2) call.
1271 *
1272 * If the parent aborted, we'll detect this below, hence ignore
1273 * return failure here. */
1274 (void) barrier_place(&barrier);
1275
1276 /* Check if our parent process might already have died? */
1277 if (getppid() == parent_pid) {
1278 sigset_t ss;
1279
1280 assert_se(sigemptyset(&ss) >= 0);
1281 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1282
1283 for (;;) {
1284 if (sigwait(&ss, &sig) < 0) {
1285 if (errno == EINTR)
1286 continue;
1287
1288 goto child_finish;
1289 }
1290
1291 assert(sig == SIGTERM);
1292 break;
1293 }
1294 }
1295
1296 pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
1297 if (pam_code != PAM_SUCCESS)
1298 goto child_finish;
1299
1300 /* If our parent died we'll end the session */
1301 if (getppid() != parent_pid) {
1302 pam_code = pam_close_session(handle, flags);
1303 if (pam_code != PAM_SUCCESS)
1304 goto child_finish;
1305 }
1306
1307 ret = 0;
1308
1309 child_finish:
1310 pam_end(handle, pam_code | flags);
1311 _exit(ret);
1312 }
1313
1314 barrier_set_role(&barrier, BARRIER_PARENT);
1315
1316 /* If the child was forked off successfully it will do all the
1317 * cleanups, so forget about the handle here. */
1318 handle = NULL;
1319
1320 /* Unblock SIGTERM again in the parent */
1321 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1322
1323 /* We close the log explicitly here, since the PAM modules
1324 * might have opened it, but we don't want this fd around. */
1325 closelog();
1326
1327 /* Synchronously wait for the child to initialize. We don't care for
1328 * errors as we cannot recover. However, warn loudly if it happens. */
1329 if (!barrier_place_and_sync(&barrier))
1330 log_error("PAM initialization failed");
1331
1332 return strv_free_and_replace(*env, e);
1333
1334 fail:
1335 if (pam_code != PAM_SUCCESS) {
1336 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1337 r = -EPERM; /* PAM errors do not map to errno */
1338 } else
1339 log_error_errno(r, "PAM failed: %m");
1340
1341 if (handle) {
1342 if (close_session)
1343 pam_code = pam_close_session(handle, flags);
1344
1345 pam_end(handle, pam_code | flags);
1346 }
1347
1348 strv_free(e);
1349 closelog();
1350
1351 return r;
1352 #else
1353 return 0;
1354 #endif
1355 }
1356
/* Derives a short, parenthesized process title from the last component of the given executable
 * path and installs it through rename_process(). */
static void rename_process_from_path(const char *path) {
        char title[11];
        const char *name;
        size_t len;

        /* The title has to fit into 10 characters (the length of "/sbin/init") so that it looks
         * pretty in /bin/ps: at most 8 characters of the name plus the two parentheses. */

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        len = strlen(name);
        if (len > 8) {
                /* Keep the tail of the name rather than its head: the beginning is frequently
                 * just a common prefix such as "systemd-". */
                name += len - 8;
                len = 8;
        }

        title[0] = '(';
        memcpy(title + 1, name, len);
        title[len + 1] = ')';
        title[len + 2] = 0;

        rename_process(title);
}
1387
1388 static bool context_has_address_families(const ExecContext *c) {
1389 assert(c);
1390
1391 return c->address_families_whitelist ||
1392 !set_isempty(c->address_families);
1393 }
1394
1395 static bool context_has_syscall_filters(const ExecContext *c) {
1396 assert(c);
1397
1398 return c->syscall_whitelist ||
1399 !hashmap_isempty(c->syscall_filter);
1400 }
1401
1402 static bool context_has_no_new_privileges(const ExecContext *c) {
1403 assert(c);
1404
1405 if (c->no_new_privileges)
1406 return true;
1407
1408 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1409 return false;
1410
1411 /* We need NNP if we have any form of seccomp and are unprivileged */
1412 return context_has_address_families(c) ||
1413 c->memory_deny_write_execute ||
1414 c->restrict_realtime ||
1415 c->restrict_suid_sgid ||
1416 exec_context_restrict_namespaces_set(c) ||
1417 c->protect_clock ||
1418 c->protect_kernel_tunables ||
1419 c->protect_kernel_modules ||
1420 c->protect_kernel_logs ||
1421 c->private_devices ||
1422 context_has_syscall_filters(c) ||
1423 !set_isempty(c->syscall_archs) ||
1424 c->lock_personality ||
1425 c->protect_hostname;
1426 }
1427
1428 #if HAVE_SECCOMP
1429
1430 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1431
1432 if (is_seccomp_available())
1433 return false;
1434
1435 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1436 return true;
1437 }
1438
1439 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1440 uint32_t negative_action, default_action, action;
1441 int r;
1442
1443 assert(u);
1444 assert(c);
1445
1446 if (!context_has_syscall_filters(c))
1447 return 0;
1448
1449 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1450 return 0;
1451
1452 negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1453
1454 if (c->syscall_whitelist) {
1455 default_action = negative_action;
1456 action = SCMP_ACT_ALLOW;
1457 } else {
1458 default_action = SCMP_ACT_ALLOW;
1459 action = negative_action;
1460 }
1461
1462 if (needs_ambient_hack) {
1463 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1464 if (r < 0)
1465 return r;
1466 }
1467
1468 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1469 }
1470
1471 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1472 assert(u);
1473 assert(c);
1474
1475 if (set_isempty(c->syscall_archs))
1476 return 0;
1477
1478 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1479 return 0;
1480
1481 return seccomp_restrict_archs(c->syscall_archs);
1482 }
1483
1484 static int apply_address_families(const Unit* u, const ExecContext *c) {
1485 assert(u);
1486 assert(c);
1487
1488 if (!context_has_address_families(c))
1489 return 0;
1490
1491 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1492 return 0;
1493
1494 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1495 }
1496
1497 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1498 assert(u);
1499 assert(c);
1500
1501 if (!c->memory_deny_write_execute)
1502 return 0;
1503
1504 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1505 return 0;
1506
1507 return seccomp_memory_deny_write_execute();
1508 }
1509
1510 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1511 assert(u);
1512 assert(c);
1513
1514 if (!c->restrict_realtime)
1515 return 0;
1516
1517 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1518 return 0;
1519
1520 return seccomp_restrict_realtime();
1521 }
1522
1523 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1524 assert(u);
1525 assert(c);
1526
1527 if (!c->restrict_suid_sgid)
1528 return 0;
1529
1530 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1531 return 0;
1532
1533 return seccomp_restrict_suid_sgid();
1534 }
1535
1536 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1537 assert(u);
1538 assert(c);
1539
1540 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1541 * let's protect even those systems where this is left on in the kernel. */
1542
1543 if (!c->protect_kernel_tunables)
1544 return 0;
1545
1546 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1547 return 0;
1548
1549 return seccomp_protect_sysctl();
1550 }
1551
1552 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1553 assert(u);
1554 assert(c);
1555
1556 /* Turn off module syscalls on ProtectKernelModules=yes */
1557
1558 if (!c->protect_kernel_modules)
1559 return 0;
1560
1561 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1562 return 0;
1563
1564 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1565 }
1566
1567 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1568 assert(u);
1569 assert(c);
1570
1571 if (!c->protect_kernel_logs)
1572 return 0;
1573
1574 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1575 return 0;
1576
1577 return seccomp_protect_syslog();
1578 }
1579
1580 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1581 assert(u);
1582 assert(c);
1583
1584 if (!c->protect_clock)
1585 return 0;
1586
1587 if (skip_seccomp_unavailable(u, "ProtectClock="))
1588 return 0;
1589
1590 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1591 }
1592
1593 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1594 assert(u);
1595 assert(c);
1596
1597 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1598
1599 if (!c->private_devices)
1600 return 0;
1601
1602 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1603 return 0;
1604
1605 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1606 }
1607
1608 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1609 assert(u);
1610 assert(c);
1611
1612 if (!exec_context_restrict_namespaces_set(c))
1613 return 0;
1614
1615 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1616 return 0;
1617
1618 return seccomp_restrict_namespaces(c->restrict_namespaces);
1619 }
1620
1621 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1622 unsigned long personality;
1623 int r;
1624
1625 assert(u);
1626 assert(c);
1627
1628 if (!c->lock_personality)
1629 return 0;
1630
1631 if (skip_seccomp_unavailable(u, "LockPersonality="))
1632 return 0;
1633
1634 personality = c->personality;
1635
1636 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1637 if (personality == PERSONALITY_INVALID) {
1638
1639 r = opinionated_personality(&personality);
1640 if (r < 0)
1641 return r;
1642 }
1643
1644 return seccomp_lock_personality(personality);
1645 }
1646
1647 #endif
1648
1649 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1650 assert(idle_pipe);
1651
1652 idle_pipe[1] = safe_close(idle_pipe[1]);
1653 idle_pipe[2] = safe_close(idle_pipe[2]);
1654
1655 if (idle_pipe[0] >= 0) {
1656 int r;
1657
1658 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1659
1660 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1661 ssize_t n;
1662
1663 /* Signal systemd that we are bored and want to continue. */
1664 n = write(idle_pipe[3], "x", 1);
1665 if (n > 0)
1666 /* Wait for systemd to react to the signal above. */
1667 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1668 }
1669
1670 idle_pipe[0] = safe_close(idle_pipe[0]);
1671
1672 }
1673
1674 idle_pipe[3] = safe_close(idle_pipe[3]);
1675 }
1676
1677 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1678
1679 static int build_environment(
1680 const Unit *u,
1681 const ExecContext *c,
1682 const ExecParameters *p,
1683 size_t n_fds,
1684 const char *home,
1685 const char *username,
1686 const char *shell,
1687 dev_t journal_stream_dev,
1688 ino_t journal_stream_ino,
1689 char ***ret) {
1690
1691 _cleanup_strv_free_ char **our_env = NULL;
1692 ExecDirectoryType t;
1693 size_t n_env = 0;
1694 char *x;
1695
1696 assert(u);
1697 assert(c);
1698 assert(p);
1699 assert(ret);
1700
1701 our_env = new0(char*, 15 + _EXEC_DIRECTORY_TYPE_MAX);
1702 if (!our_env)
1703 return -ENOMEM;
1704
1705 if (n_fds > 0) {
1706 _cleanup_free_ char *joined = NULL;
1707
1708 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1709 return -ENOMEM;
1710 our_env[n_env++] = x;
1711
1712 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1713 return -ENOMEM;
1714 our_env[n_env++] = x;
1715
1716 joined = strv_join(p->fd_names, ":");
1717 if (!joined)
1718 return -ENOMEM;
1719
1720 x = strjoin("LISTEN_FDNAMES=", joined);
1721 if (!x)
1722 return -ENOMEM;
1723 our_env[n_env++] = x;
1724 }
1725
1726 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1727 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1728 return -ENOMEM;
1729 our_env[n_env++] = x;
1730
1731 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1732 return -ENOMEM;
1733 our_env[n_env++] = x;
1734 }
1735
1736 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1737 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1738 * check the database directly. */
1739 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1740 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1741 if (!x)
1742 return -ENOMEM;
1743 our_env[n_env++] = x;
1744 }
1745
1746 if (home) {
1747 x = strjoin("HOME=", home);
1748 if (!x)
1749 return -ENOMEM;
1750
1751 path_simplify(x + 5, true);
1752 our_env[n_env++] = x;
1753 }
1754
1755 if (username) {
1756 x = strjoin("LOGNAME=", username);
1757 if (!x)
1758 return -ENOMEM;
1759 our_env[n_env++] = x;
1760
1761 x = strjoin("USER=", username);
1762 if (!x)
1763 return -ENOMEM;
1764 our_env[n_env++] = x;
1765 }
1766
1767 if (shell) {
1768 x = strjoin("SHELL=", shell);
1769 if (!x)
1770 return -ENOMEM;
1771
1772 path_simplify(x + 6, true);
1773 our_env[n_env++] = x;
1774 }
1775
1776 if (!sd_id128_is_null(u->invocation_id)) {
1777 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1778 return -ENOMEM;
1779
1780 our_env[n_env++] = x;
1781 }
1782
1783 if (exec_context_needs_term(c)) {
1784 const char *tty_path, *term = NULL;
1785
1786 tty_path = exec_context_tty_path(c);
1787
1788 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1789 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1790 * passes to PID 1 ends up all the way in the console login shown. */
1791
1792 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1793 term = getenv("TERM");
1794 if (!term)
1795 term = default_term_for_tty(tty_path);
1796
1797 x = strjoin("TERM=", term);
1798 if (!x)
1799 return -ENOMEM;
1800 our_env[n_env++] = x;
1801 }
1802
1803 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1804 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1805 return -ENOMEM;
1806
1807 our_env[n_env++] = x;
1808 }
1809
1810 if (c->log_namespace) {
1811 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1812 if (!x)
1813 return -ENOMEM;
1814
1815 our_env[n_env++] = x;
1816 }
1817
1818 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1819 _cleanup_free_ char *pre = NULL, *joined = NULL;
1820 const char *n;
1821
1822 if (!p->prefix[t])
1823 continue;
1824
1825 if (strv_isempty(c->directories[t].paths))
1826 continue;
1827
1828 n = exec_directory_env_name_to_string(t);
1829 if (!n)
1830 continue;
1831
1832 pre = strjoin(p->prefix[t], "/");
1833 if (!pre)
1834 return -ENOMEM;
1835
1836 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1837 if (!joined)
1838 return -ENOMEM;
1839
1840 x = strjoin(n, "=", joined);
1841 if (!x)
1842 return -ENOMEM;
1843
1844 our_env[n_env++] = x;
1845 }
1846
1847 our_env[n_env++] = NULL;
1848 assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1849
1850 *ret = TAKE_PTR(our_env);
1851
1852 return 0;
1853 }
1854
1855 static int build_pass_environment(const ExecContext *c, char ***ret) {
1856 _cleanup_strv_free_ char **pass_env = NULL;
1857 size_t n_env = 0, n_bufsize = 0;
1858 char **i;
1859
1860 STRV_FOREACH(i, c->pass_environment) {
1861 _cleanup_free_ char *x = NULL;
1862 char *v;
1863
1864 v = getenv(*i);
1865 if (!v)
1866 continue;
1867 x = strjoin(*i, "=", v);
1868 if (!x)
1869 return -ENOMEM;
1870
1871 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1872 return -ENOMEM;
1873
1874 pass_env[n_env++] = TAKE_PTR(x);
1875 pass_env[n_env] = NULL;
1876 }
1877
1878 *ret = TAKE_PTR(pass_env);
1879
1880 return 0;
1881 }
1882
1883 static bool exec_needs_mount_namespace(
1884 const ExecContext *context,
1885 const ExecParameters *params,
1886 const ExecRuntime *runtime) {
1887
1888 assert(context);
1889 assert(params);
1890
1891 if (context->root_image)
1892 return true;
1893
1894 if (!strv_isempty(context->read_write_paths) ||
1895 !strv_isempty(context->read_only_paths) ||
1896 !strv_isempty(context->inaccessible_paths))
1897 return true;
1898
1899 if (context->n_bind_mounts > 0)
1900 return true;
1901
1902 if (context->n_temporary_filesystems > 0)
1903 return true;
1904
1905 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
1906 return true;
1907
1908 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1909 return true;
1910
1911 if (context->private_devices ||
1912 context->private_mounts ||
1913 context->protect_system != PROTECT_SYSTEM_NO ||
1914 context->protect_home != PROTECT_HOME_NO ||
1915 context->protect_kernel_tunables ||
1916 context->protect_kernel_modules ||
1917 context->protect_kernel_logs ||
1918 context->protect_control_groups)
1919 return true;
1920
1921 if (context->root_directory) {
1922 ExecDirectoryType t;
1923
1924 if (context->mount_apivfs)
1925 return true;
1926
1927 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1928 if (!params->prefix[t])
1929 continue;
1930
1931 if (!strv_isempty(context->directories[t].paths))
1932 return true;
1933 }
1934 }
1935
1936 if (context->dynamic_user &&
1937 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1938 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1939 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1940 return true;
1941
1942 if (context->log_namespace)
1943 return true;
1944
1945 return false;
1946 }
1947
1948 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
1949 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1950 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1951 _cleanup_close_ int unshare_ready_fd = -1;
1952 _cleanup_(sigkill_waitp) pid_t pid = 0;
1953 uint64_t c = 1;
1954 ssize_t n;
1955 int r;
1956
1957 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
1958 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
1959 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1960 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1961 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1962 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1963 * continues execution normally.
1964 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
1965 * does not need CAP_SETUID to write the single line mapping to itself. */
1966
1967 /* Can only set up multiple mappings with CAP_SETUID. */
1968 if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
1969 r = asprintf(&uid_map,
1970 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
1971 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
1972 ouid, ouid, uid, uid);
1973 else
1974 r = asprintf(&uid_map,
1975 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
1976 ouid, ouid);
1977
1978 if (r < 0)
1979 return -ENOMEM;
1980
1981 /* Can only set up multiple mappings with CAP_SETGID. */
1982 if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
1983 r = asprintf(&gid_map,
1984 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
1985 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
1986 ogid, ogid, gid, gid);
1987 else
1988 r = asprintf(&gid_map,
1989 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
1990 ogid, ogid);
1991
1992 if (r < 0)
1993 return -ENOMEM;
1994
1995 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1996 * namespace. */
1997 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1998 if (unshare_ready_fd < 0)
1999 return -errno;
2000
2001 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2002 * failed. */
2003 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2004 return -errno;
2005
2006 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2007 if (r < 0)
2008 return r;
2009 if (r == 0) {
2010 _cleanup_close_ int fd = -1;
2011 const char *a;
2012 pid_t ppid;
2013
2014 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2015 * here, after the parent opened its own user namespace. */
2016
2017 ppid = getppid();
2018 errno_pipe[0] = safe_close(errno_pipe[0]);
2019
2020 /* Wait until the parent unshared the user namespace */
2021 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2022 r = -errno;
2023 goto child_fail;
2024 }
2025
2026 /* Disable the setgroups() system call in the child user namespace, for good. */
2027 a = procfs_file_alloca(ppid, "setgroups");
2028 fd = open(a, O_WRONLY|O_CLOEXEC);
2029 if (fd < 0) {
2030 if (errno != ENOENT) {
2031 r = -errno;
2032 goto child_fail;
2033 }
2034
2035 /* If the file is missing the kernel is too old, let's continue anyway. */
2036 } else {
2037 if (write(fd, "deny\n", 5) < 0) {
2038 r = -errno;
2039 goto child_fail;
2040 }
2041
2042 fd = safe_close(fd);
2043 }
2044
2045 /* First write the GID map */
2046 a = procfs_file_alloca(ppid, "gid_map");
2047 fd = open(a, O_WRONLY|O_CLOEXEC);
2048 if (fd < 0) {
2049 r = -errno;
2050 goto child_fail;
2051 }
2052 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2053 r = -errno;
2054 goto child_fail;
2055 }
2056 fd = safe_close(fd);
2057
2058 /* The write the UID map */
2059 a = procfs_file_alloca(ppid, "uid_map");
2060 fd = open(a, O_WRONLY|O_CLOEXEC);
2061 if (fd < 0) {
2062 r = -errno;
2063 goto child_fail;
2064 }
2065 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2066 r = -errno;
2067 goto child_fail;
2068 }
2069
2070 _exit(EXIT_SUCCESS);
2071
2072 child_fail:
2073 (void) write(errno_pipe[1], &r, sizeof(r));
2074 _exit(EXIT_FAILURE);
2075 }
2076
2077 errno_pipe[1] = safe_close(errno_pipe[1]);
2078
2079 if (unshare(CLONE_NEWUSER) < 0)
2080 return -errno;
2081
2082 /* Let the child know that the namespace is ready now */
2083 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2084 return -errno;
2085
2086 /* Try to read an error code from the child */
2087 n = read(errno_pipe[0], &r, sizeof(r));
2088 if (n < 0)
2089 return -errno;
2090 if (n == sizeof(r)) { /* an error code was sent to us */
2091 if (r < 0)
2092 return r;
2093 return -EIO;
2094 }
2095 if (n != 0) /* on success we should have read 0 bytes */
2096 return -EIO;
2097
2098 r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2099 pid = 0;
2100 if (r < 0)
2101 return r;
2102 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2103 return -EIO;
2104
2105 return 0;
2106 }
2107
2108 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2109 if (!context->dynamic_user)
2110 return false;
2111
2112 if (type == EXEC_DIRECTORY_CONFIGURATION)
2113 return false;
2114
2115 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2116 return false;
2117
2118 return true;
2119 }
2120
2121 static int setup_exec_directory(
2122 const ExecContext *context,
2123 const ExecParameters *params,
2124 uid_t uid,
2125 gid_t gid,
2126 ExecDirectoryType type,
2127 int *exit_status) {
2128
2129 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2130 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2131 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2132 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2133 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2134 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2135 };
2136 char **rt;
2137 int r;
2138
2139 assert(context);
2140 assert(params);
2141 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2142 assert(exit_status);
2143
2144 if (!params->prefix[type])
2145 return 0;
2146
2147 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2148 if (!uid_is_valid(uid))
2149 uid = 0;
2150 if (!gid_is_valid(gid))
2151 gid = 0;
2152 }
2153
2154 STRV_FOREACH(rt, context->directories[type].paths) {
2155 _cleanup_free_ char *p = NULL, *pp = NULL;
2156
2157 p = path_join(params->prefix[type], *rt);
2158 if (!p) {
2159 r = -ENOMEM;
2160 goto fail;
2161 }
2162
2163 r = mkdir_parents_label(p, 0755);
2164 if (r < 0)
2165 goto fail;
2166
2167 if (exec_directory_is_private(context, type)) {
2168 _cleanup_free_ char *private_root = NULL;
2169
2170 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2171 * case we want to avoid leaving a directory around fully accessible that is owned by
2172 * a dynamic user whose UID is later on reused. To lock this down we use the same
2173 * trick used by container managers to prohibit host users to get access to files of
2174 * the same UID in containers: we place everything inside a directory that has an
2175 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2176 * for unprivileged host code. We then use fs namespacing to make this directory
2177 * permeable for the service itself.
2178 *
2179 * Specifically: for a service which wants a special directory "foo/" we first create
2180 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2181 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2182 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2183 * unprivileged host users can't look into it. Inside of the namespace of the unit
2184 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2185 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2186 * for the service and making sure it only gets access to the dirs it needs but no
2187 * others. Tricky? Yes, absolutely, but it works!
2188 *
2189 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2190 * to be owned by the service itself.
2191 *
2192 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2193 * for sharing files or sockets with other services. */
2194
2195 private_root = path_join(params->prefix[type], "private");
2196 if (!private_root) {
2197 r = -ENOMEM;
2198 goto fail;
2199 }
2200
2201 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2202 r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2203 if (r < 0)
2204 goto fail;
2205
2206 pp = path_join(private_root, *rt);
2207 if (!pp) {
2208 r = -ENOMEM;
2209 goto fail;
2210 }
2211
2212 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2213 r = mkdir_parents_label(pp, 0755);
2214 if (r < 0)
2215 goto fail;
2216
2217 if (is_dir(p, false) > 0 &&
2218 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2219
2220 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2221 * it over. Most likely the service has been upgraded from one that didn't use
2222 * DynamicUser=1, to one that does. */
2223
2224 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2225 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2226 exec_directory_type_to_string(type), p, pp);
2227
2228 if (rename(p, pp) < 0) {
2229 r = -errno;
2230 goto fail;
2231 }
2232 } else {
2233 /* Otherwise, create the actual directory for the service */
2234
2235 r = mkdir_label(pp, context->directories[type].mode);
2236 if (r < 0 && r != -EEXIST)
2237 goto fail;
2238 }
2239
2240 /* And link it up from the original place */
2241 r = symlink_idempotent(pp, p, true);
2242 if (r < 0)
2243 goto fail;
2244
2245 } else {
2246 _cleanup_free_ char *target = NULL;
2247
2248 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2249 readlink_and_make_absolute(p, &target) >= 0) {
2250 _cleanup_free_ char *q = NULL;
2251
2252 /* This already exists and is a symlink? Interesting. Maybe it's one created
2253 * by DynamicUser=1 (see above)?
2254 *
2255 * We do this for all directory types except for ConfigurationDirectory=,
2256 * since they all support the private/ symlink logic at least in some
2257 * configurations, see above. */
2258
2259 q = path_join(params->prefix[type], "private", *rt);
2260 if (!q) {
2261 r = -ENOMEM;
2262 goto fail;
2263 }
2264
2265 if (path_equal(q, target)) {
2266
2267 /* Hmm, apparently DynamicUser= was once turned on for this service,
2268 * but is no longer. Let's move the directory back up. */
2269
2270 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2271 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2272 exec_directory_type_to_string(type), q, p);
2273
2274 if (unlink(p) < 0) {
2275 r = -errno;
2276 goto fail;
2277 }
2278
2279 if (rename(q, p) < 0) {
2280 r = -errno;
2281 goto fail;
2282 }
2283 }
2284 }
2285
2286 r = mkdir_label(p, context->directories[type].mode);
2287 if (r < 0) {
2288 if (r != -EEXIST)
2289 goto fail;
2290
2291 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2292 struct stat st;
2293
2294 /* Don't change the owner/access mode of the configuration directory,
2295 * as in the common case it is not written to by a service, and shall
2296 * not be writable. */
2297
2298 if (stat(p, &st) < 0) {
2299 r = -errno;
2300 goto fail;
2301 }
2302
2303 /* Still complain if the access mode doesn't match */
2304 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2305 log_warning("%s \'%s\' already exists but the mode is different. "
2306 "(File system: %o %sMode: %o)",
2307 exec_directory_type_to_string(type), *rt,
2308 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2309
2310 continue;
2311 }
2312 }
2313 }
2314
2315 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2316 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2317 * current UID/GID ownership.) */
2318 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2319 if (r < 0)
2320 goto fail;
2321
2322 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2323 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2324 * assignments to exist.*/
2325 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2326 if (r < 0)
2327 goto fail;
2328 }
2329
2330 return 0;
2331
2332 fail:
2333 *exit_status = exit_status_table[type];
2334 return r;
2335 }
2336
#if ENABLE_SMACK
/* Applies the SMACK label the executed process shall run under.
 *
 * If the context carries an explicit smack_process_label, that label is handed to mac_smack_apply_pid()
 * with 0 as the PID. Otherwise, and only in builds that define SMACK_DEFAULT_PROCESS_LABEL, the
 * SMACK_ATTR_EXEC attribute of the command's binary is read and applied, falling back to the
 * compile-time default label when no such attribute was obtained.
 *
 * Returns 0 on success, or the negative error returned by mac_smack_read()/mac_smack_apply_pid(). */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r;

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *binary_label = NULL;

                /* -ENODATA and -EOPNOTSUPP from the read are tolerated: in that case binary_label
                 * stays NULL and the compile-time default is applied below. */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &binary_label);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                r = mac_smack_apply_pid(0, binary_label ?: SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2369
2370 static int compile_bind_mounts(
2371 const ExecContext *context,
2372 const ExecParameters *params,
2373 BindMount **ret_bind_mounts,
2374 size_t *ret_n_bind_mounts,
2375 char ***ret_empty_directories) {
2376
2377 _cleanup_strv_free_ char **empty_directories = NULL;
2378 BindMount *bind_mounts;
2379 size_t n, h = 0, i;
2380 ExecDirectoryType t;
2381 int r;
2382
2383 assert(context);
2384 assert(params);
2385 assert(ret_bind_mounts);
2386 assert(ret_n_bind_mounts);
2387 assert(ret_empty_directories);
2388
2389 n = context->n_bind_mounts;
2390 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2391 if (!params->prefix[t])
2392 continue;
2393
2394 n += strv_length(context->directories[t].paths);
2395 }
2396
2397 if (n <= 0) {
2398 *ret_bind_mounts = NULL;
2399 *ret_n_bind_mounts = 0;
2400 *ret_empty_directories = NULL;
2401 return 0;
2402 }
2403
2404 bind_mounts = new(BindMount, n);
2405 if (!bind_mounts)
2406 return -ENOMEM;
2407
2408 for (i = 0; i < context->n_bind_mounts; i++) {
2409 BindMount *item = context->bind_mounts + i;
2410 char *s, *d;
2411
2412 s = strdup(item->source);
2413 if (!s) {
2414 r = -ENOMEM;
2415 goto finish;
2416 }
2417
2418 d = strdup(item->destination);
2419 if (!d) {
2420 free(s);
2421 r = -ENOMEM;
2422 goto finish;
2423 }
2424
2425 bind_mounts[h++] = (BindMount) {
2426 .source = s,
2427 .destination = d,
2428 .read_only = item->read_only,
2429 .recursive = item->recursive,
2430 .ignore_enoent = item->ignore_enoent,
2431 };
2432 }
2433
2434 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2435 char **suffix;
2436
2437 if (!params->prefix[t])
2438 continue;
2439
2440 if (strv_isempty(context->directories[t].paths))
2441 continue;
2442
2443 if (exec_directory_is_private(context, t) &&
2444 !(context->root_directory || context->root_image)) {
2445 char *private_root;
2446
2447 /* So this is for a dynamic user, and we need to make sure the process can access its own
2448 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2449 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2450
2451 private_root = path_join(params->prefix[t], "private");
2452 if (!private_root) {
2453 r = -ENOMEM;
2454 goto finish;
2455 }
2456
2457 r = strv_consume(&empty_directories, private_root);
2458 if (r < 0)
2459 goto finish;
2460 }
2461
2462 STRV_FOREACH(suffix, context->directories[t].paths) {
2463 char *s, *d;
2464
2465 if (exec_directory_is_private(context, t))
2466 s = path_join(params->prefix[t], "private", *suffix);
2467 else
2468 s = path_join(params->prefix[t], *suffix);
2469 if (!s) {
2470 r = -ENOMEM;
2471 goto finish;
2472 }
2473
2474 if (exec_directory_is_private(context, t) &&
2475 (context->root_directory || context->root_image))
2476 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2477 * directory is not created on the root directory. So, let's bind-mount the directory
2478 * on the 'non-private' place. */
2479 d = path_join(params->prefix[t], *suffix);
2480 else
2481 d = strdup(s);
2482 if (!d) {
2483 free(s);
2484 r = -ENOMEM;
2485 goto finish;
2486 }
2487
2488 bind_mounts[h++] = (BindMount) {
2489 .source = s,
2490 .destination = d,
2491 .read_only = false,
2492 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2493 .recursive = true,
2494 .ignore_enoent = false,
2495 };
2496 }
2497 }
2498
2499 assert(h == n);
2500
2501 *ret_bind_mounts = bind_mounts;
2502 *ret_n_bind_mounts = n;
2503 *ret_empty_directories = TAKE_PTR(empty_directories);
2504
2505 return (int) n;
2506
2507 finish:
2508 bind_mount_free_many(bind_mounts, h);
2509 return r;
2510 }
2511
2512 static bool insist_on_sandboxing(
2513 const ExecContext *context,
2514 const char *root_dir,
2515 const char *root_image,
2516 const BindMount *bind_mounts,
2517 size_t n_bind_mounts) {
2518
2519 size_t i;
2520
2521 assert(context);
2522 assert(n_bind_mounts == 0 || bind_mounts);
2523
2524 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2525 * would alter the view on the file system beyond making things read-only or invisble, i.e. would
2526 * rearrange stuff in a way we cannot ignore gracefully. */
2527
2528 if (context->n_temporary_filesystems > 0)
2529 return true;
2530
2531 if (root_dir || root_image)
2532 return true;
2533
2534 if (context->dynamic_user)
2535 return true;
2536
2537 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2538 * essential. */
2539 for (i = 0; i < n_bind_mounts; i++)
2540 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2541 return true;
2542
2543 if (context->log_namespace)
2544 return true;
2545
2546 return false;
2547 }
2548
2549 static int apply_mount_namespace(
2550 const Unit *u,
2551 const ExecCommand *command,
2552 const ExecContext *context,
2553 const ExecParameters *params,
2554 const ExecRuntime *runtime,
2555 char **error_path) {
2556
2557 _cleanup_strv_free_ char **empty_directories = NULL;
2558 char *tmp = NULL, *var = NULL;
2559 const char *root_dir = NULL, *root_image = NULL;
2560 NamespaceInfo ns_info;
2561 bool needs_sandboxing;
2562 BindMount *bind_mounts = NULL;
2563 size_t n_bind_mounts = 0;
2564 int r;
2565
2566 assert(context);
2567
2568 if (params->flags & EXEC_APPLY_CHROOT) {
2569 root_image = context->root_image;
2570
2571 if (!root_image)
2572 root_dir = context->root_directory;
2573 }
2574
2575 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2576 if (r < 0)
2577 return r;
2578
2579 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2580 if (needs_sandboxing) {
2581 /* The runtime struct only contains the parent of the private /tmp,
2582 * which is non-accessible to world users. Inside of it there's a /tmp
2583 * that is sticky, and that's the one we want to use here. */
2584
2585 if (context->private_tmp && runtime) {
2586 if (runtime->tmp_dir)
2587 tmp = strjoina(runtime->tmp_dir, "/tmp");
2588 if (runtime->var_tmp_dir)
2589 var = strjoina(runtime->var_tmp_dir, "/tmp");
2590 }
2591
2592 ns_info = (NamespaceInfo) {
2593 .ignore_protect_paths = false,
2594 .private_dev = context->private_devices,
2595 .protect_control_groups = context->protect_control_groups,
2596 .protect_kernel_tunables = context->protect_kernel_tunables,
2597 .protect_kernel_modules = context->protect_kernel_modules,
2598 .protect_kernel_logs = context->protect_kernel_logs,
2599 .protect_hostname = context->protect_hostname,
2600 .mount_apivfs = context->mount_apivfs,
2601 .private_mounts = context->private_mounts,
2602 };
2603 } else if (!context->dynamic_user && root_dir)
2604 /*
2605 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2606 * sandbox info, otherwise enforce it, don't ignore protected paths and
2607 * fail if we are enable to apply the sandbox inside the mount namespace.
2608 */
2609 ns_info = (NamespaceInfo) {
2610 .ignore_protect_paths = true,
2611 };
2612 else
2613 ns_info = (NamespaceInfo) {};
2614
2615 if (context->mount_flags == MS_SHARED)
2616 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2617
2618 r = setup_namespace(root_dir, root_image,
2619 &ns_info, context->read_write_paths,
2620 needs_sandboxing ? context->read_only_paths : NULL,
2621 needs_sandboxing ? context->inaccessible_paths : NULL,
2622 empty_directories,
2623 bind_mounts,
2624 n_bind_mounts,
2625 context->temporary_filesystems,
2626 context->n_temporary_filesystems,
2627 tmp,
2628 var,
2629 context->log_namespace,
2630 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2631 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2632 context->mount_flags,
2633 DISSECT_IMAGE_DISCARD_ON_LOOP|DISSECT_IMAGE_RELAX_VAR_CHECK|DISSECT_IMAGE_FSCK,
2634 error_path);
2635
2636 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2637 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2638 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2639 * completely different execution environment. */
2640 if (r == -ENOANO) {
2641 if (insist_on_sandboxing(
2642 context,
2643 root_dir, root_image,
2644 bind_mounts,
2645 n_bind_mounts)) {
2646 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2647 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2648 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2649
2650 r = -EOPNOTSUPP;
2651 } else {
2652 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2653 r = 0;
2654 }
2655 }
2656
2657 bind_mount_free_many(bind_mounts, n_bind_mounts);
2658 return r;
2659 }
2660
2661 static int apply_working_directory(
2662 const ExecContext *context,
2663 const ExecParameters *params,
2664 const char *home,
2665 int *exit_status) {
2666
2667 const char *d, *wd;
2668
2669 assert(context);
2670 assert(exit_status);
2671
2672 if (context->working_directory_home) {
2673
2674 if (!home) {
2675 *exit_status = EXIT_CHDIR;
2676 return -ENXIO;
2677 }
2678
2679 wd = home;
2680
2681 } else if (context->working_directory)
2682 wd = context->working_directory;
2683 else
2684 wd = "/";
2685
2686 if (params->flags & EXEC_APPLY_CHROOT)
2687 d = wd;
2688 else
2689 d = prefix_roota(context->root_directory, wd);
2690
2691 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2692 *exit_status = EXIT_CHDIR;
2693 return -errno;
2694 }
2695
2696 return 0;
2697 }
2698
2699 static int apply_root_directory(
2700 const ExecContext *context,
2701 const ExecParameters *params,
2702 const bool needs_mount_ns,
2703 int *exit_status) {
2704
2705 assert(context);
2706 assert(exit_status);
2707
2708 if (params->flags & EXEC_APPLY_CHROOT) {
2709 if (!needs_mount_ns && context->root_directory)
2710 if (chroot(context->root_directory) < 0) {
2711 *exit_status = EXIT_CHROOT;
2712 return -errno;
2713 }
2714 }
2715
2716 return 0;
2717 }
2718
2719 static int setup_keyring(
2720 const Unit *u,
2721 const ExecContext *context,
2722 const ExecParameters *p,
2723 uid_t uid, gid_t gid) {
2724
2725 key_serial_t keyring;
2726 int r = 0;
2727 uid_t saved_uid;
2728 gid_t saved_gid;
2729
2730 assert(u);
2731 assert(context);
2732 assert(p);
2733
2734 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2735 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2736 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2737 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2738 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2739 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2740
2741 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2742 return 0;
2743
2744 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2745 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2746 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2747 * & group is just as nasty as acquiring a reference to the user keyring. */
2748
2749 saved_uid = getuid();
2750 saved_gid = getgid();
2751
2752 if (gid_is_valid(gid) && gid != saved_gid) {
2753 if (setregid(gid, -1) < 0)
2754 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2755 }
2756
2757 if (uid_is_valid(uid) && uid != saved_uid) {
2758 if (setreuid(uid, -1) < 0) {
2759 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2760 goto out;
2761 }
2762 }
2763
2764 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2765 if (keyring == -1) {
2766 if (errno == ENOSYS)
2767 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2768 else if (IN_SET(errno, EACCES, EPERM))
2769 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2770 else if (errno == EDQUOT)
2771 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2772 else
2773 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2774
2775 goto out;
2776 }
2777
2778 /* When requested link the user keyring into the session keyring. */
2779 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2780
2781 if (keyctl(KEYCTL_LINK,
2782 KEY_SPEC_USER_KEYRING,
2783 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2784 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2785 goto out;
2786 }
2787 }
2788
2789 /* Restore uid/gid back */
2790 if (uid_is_valid(uid) && uid != saved_uid) {
2791 if (setreuid(saved_uid, -1) < 0) {
2792 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2793 goto out;
2794 }
2795 }
2796
2797 if (gid_is_valid(gid) && gid != saved_gid) {
2798 if (setregid(saved_gid, -1) < 0)
2799 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2800 }
2801
2802 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2803 if (!sd_id128_is_null(u->invocation_id)) {
2804 key_serial_t key;
2805
2806 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2807 if (key == -1)
2808 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2809 else {
2810 if (keyctl(KEYCTL_SETPERM, key,
2811 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2812 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2813 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2814 }
2815 }
2816
2817 out:
2818 /* Revert back uid & gid for the the last time, and exit */
2819 /* no extra logging, as only the first already reported error matters */
2820 if (getuid() != saved_uid)
2821 (void) setreuid(saved_uid, -1);
2822
2823 if (getgid() != saved_gid)
2824 (void) setregid(saved_gid, -1);
2825
2826 return r;
2827 }
2828
/* Appends each non-negative fd of the two-element 'pair' to 'array' at index *n, advancing *n for every
 * fd stored. Negative entries are skipped. The caller must make sure 'array' has room for up to two
 * more elements. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++)
                if (pair[k] >= 0)
                        array[(*n)++] = pair[k];
}
2839
2840 static int close_remaining_fds(
2841 const ExecParameters *params,
2842 const ExecRuntime *runtime,
2843 const DynamicCreds *dcreds,
2844 int user_lookup_fd,
2845 int socket_fd,
2846 int exec_fd,
2847 const int *fds, size_t n_fds) {
2848
2849 size_t n_dont_close = 0;
2850 int dont_close[n_fds + 12];
2851
2852 assert(params);
2853
2854 if (params->stdin_fd >= 0)
2855 dont_close[n_dont_close++] = params->stdin_fd;
2856 if (params->stdout_fd >= 0)
2857 dont_close[n_dont_close++] = params->stdout_fd;
2858 if (params->stderr_fd >= 0)
2859 dont_close[n_dont_close++] = params->stderr_fd;
2860
2861 if (socket_fd >= 0)
2862 dont_close[n_dont_close++] = socket_fd;
2863 if (exec_fd >= 0)
2864 dont_close[n_dont_close++] = exec_fd;
2865 if (n_fds > 0) {
2866 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2867 n_dont_close += n_fds;
2868 }
2869
2870 if (runtime)
2871 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2872
2873 if (dcreds) {
2874 if (dcreds->user)
2875 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2876 if (dcreds->group)
2877 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2878 }
2879
2880 if (user_lookup_fd >= 0)
2881 dont_close[n_dont_close++] = user_lookup_fd;
2882
2883 return close_all_fds(dont_close, n_dont_close);
2884 }
2885
2886 static int send_user_lookup(
2887 Unit *unit,
2888 int user_lookup_fd,
2889 uid_t uid,
2890 gid_t gid) {
2891
2892 assert(unit);
2893
2894 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2895 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2896 * specified. */
2897
2898 if (user_lookup_fd < 0)
2899 return 0;
2900
2901 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2902 return 0;
2903
2904 if (writev(user_lookup_fd,
2905 (struct iovec[]) {
2906 IOVEC_INIT(&uid, sizeof(uid)),
2907 IOVEC_INIT(&gid, sizeof(gid)),
2908 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2909 return -errno;
2910
2911 return 0;
2912 }
2913
2914 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2915 int r;
2916
2917 assert(c);
2918 assert(home);
2919 assert(buf);
2920
2921 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2922
2923 if (*home)
2924 return 0;
2925
2926 if (!c->working_directory_home)
2927 return 0;
2928
2929 r = get_home_dir(buf);
2930 if (r < 0)
2931 return r;
2932
2933 *home = *buf;
2934 return 1;
2935 }
2936
2937 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2938 _cleanup_strv_free_ char ** list = NULL;
2939 ExecDirectoryType t;
2940 int r;
2941
2942 assert(c);
2943 assert(p);
2944 assert(ret);
2945
2946 assert(c->dynamic_user);
2947
2948 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2949 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2950 * directories. */
2951
2952 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2953 char **i;
2954
2955 if (t == EXEC_DIRECTORY_CONFIGURATION)
2956 continue;
2957
2958 if (!p->prefix[t])
2959 continue;
2960
2961 STRV_FOREACH(i, c->directories[t].paths) {
2962 char *e;
2963
2964 if (exec_directory_is_private(c, t))
2965 e = path_join(p->prefix[t], "private", *i);
2966 else
2967 e = path_join(p->prefix[t], *i);
2968 if (!e)
2969 return -ENOMEM;
2970
2971 r = strv_consume(&list, e);
2972 if (r < 0)
2973 return r;
2974 }
2975 }
2976
2977 *ret = TAKE_PTR(list);
2978
2979 return 0;
2980 }
2981
2982 static char *exec_command_line(char **argv);
2983
2984 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
2985 bool using_subcgroup;
2986 char *p;
2987
2988 assert(params);
2989 assert(ret);
2990
2991 if (!params->cgroup_path)
2992 return -EINVAL;
2993
2994 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
2995 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
2996 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
2997 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
2998 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
2999 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3000 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3001 * flag, which is only passed for the former statements, not for the latter. */
3002
3003 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3004 if (using_subcgroup)
3005 p = path_join(params->cgroup_path, ".control");
3006 else
3007 p = strdup(params->cgroup_path);
3008 if (!p)
3009 return -ENOMEM;
3010
3011 *ret = p;
3012 return using_subcgroup;
3013 }
3014
3015 static int exec_child(
3016 Unit *unit,
3017 const ExecCommand *command,
3018 const ExecContext *context,
3019 const ExecParameters *params,
3020 ExecRuntime *runtime,
3021 DynamicCreds *dcreds,
3022 int socket_fd,
3023 const int named_iofds[static 3],
3024 int *fds,
3025 size_t n_socket_fds,
3026 size_t n_storage_fds,
3027 char **files_env,
3028 int user_lookup_fd,
3029 int *exit_status) {
3030
3031 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
3032 int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
3033 _cleanup_free_ gid_t *supplementary_gids = NULL;
3034 const char *username = NULL, *groupname = NULL;
3035 _cleanup_free_ char *home_buffer = NULL;
3036 const char *home = NULL, *shell = NULL;
3037 char **final_argv = NULL;
3038 dev_t journal_stream_dev = 0;
3039 ino_t journal_stream_ino = 0;
3040 bool userns_set_up = false;
3041 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3042 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3043 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3044 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
3045 #if HAVE_SELINUX
3046 _cleanup_free_ char *mac_selinux_context_net = NULL;
3047 bool use_selinux = false;
3048 #endif
3049 #if ENABLE_SMACK
3050 bool use_smack = false;
3051 #endif
3052 #if HAVE_APPARMOR
3053 bool use_apparmor = false;
3054 #endif
3055 uid_t saved_uid = getuid();
3056 gid_t saved_gid = getgid();
3057 uid_t uid = UID_INVALID;
3058 gid_t gid = GID_INVALID;
3059 size_t n_fds;
3060 ExecDirectoryType dt;
3061 int secure_bits;
3062 _cleanup_free_ gid_t *gids_after_pam = NULL;
3063 int ngids_after_pam = 0;
3064
3065 assert(unit);
3066 assert(command);
3067 assert(context);
3068 assert(params);
3069 assert(exit_status);
3070
3071 rename_process_from_path(command->path);
3072
3073 /* We reset exactly these signals, since they are the
3074 * only ones we set to SIG_IGN in the main daemon. All
3075 * others we leave untouched because we set them to
3076 * SIG_DFL or a valid handler initially, both of which
3077 * will be demoted to SIG_DFL. */
3078 (void) default_signals(SIGNALS_CRASH_HANDLER,
3079 SIGNALS_IGNORE, -1);
3080
3081 if (context->ignore_sigpipe)
3082 (void) ignore_signals(SIGPIPE, -1);
3083
3084 r = reset_signal_mask();
3085 if (r < 0) {
3086 *exit_status = EXIT_SIGNAL_MASK;
3087 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
3088 }
3089
3090 if (params->idle_pipe)
3091 do_idle_pipe_dance(params->idle_pipe);
3092
3093 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3094 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3095 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3096 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
3097
3098 log_forget_fds();
3099 log_set_open_when_needed(true);
3100
3101 /* In case anything used libc syslog(), close this here, too */
3102 closelog();
3103
3104 n_fds = n_socket_fds + n_storage_fds;
3105 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
3106 if (r < 0) {
3107 *exit_status = EXIT_FDS;
3108 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3109 }
3110
3111 if (!context->same_pgrp)
3112 if (setsid() < 0) {
3113 *exit_status = EXIT_SETSID;
3114 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3115 }
3116
3117 exec_context_tty_reset(context, params);
3118
3119 if (unit_shall_confirm_spawn(unit)) {
3120 const char *vc = params->confirm_spawn;
3121 _cleanup_free_ char *cmdline = NULL;
3122
3123 cmdline = exec_command_line(command->argv);
3124 if (!cmdline) {
3125 *exit_status = EXIT_MEMORY;
3126 return log_oom();
3127 }
3128
3129 r = ask_for_confirmation(vc, unit, cmdline);
3130 if (r != CONFIRM_EXECUTE) {
3131 if (r == CONFIRM_PRETEND_SUCCESS) {
3132 *exit_status = EXIT_SUCCESS;
3133 return 0;
3134 }
3135 *exit_status = EXIT_CONFIRM;
3136 log_unit_error(unit, "Execution cancelled by the user");
3137 return -ECANCELED;
3138 }
3139 }
3140
3141 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3142 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3143 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3144 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3145 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3146 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3147 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3148 *exit_status = EXIT_MEMORY;
3149 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3150 }
3151
3152 if (context->dynamic_user && dcreds) {
3153 _cleanup_strv_free_ char **suggested_paths = NULL;
3154
3155 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3156 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3157 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3158 *exit_status = EXIT_USER;
3159 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3160 }
3161
3162 r = compile_suggested_paths(context, params, &suggested_paths);
3163 if (r < 0) {
3164 *exit_status = EXIT_MEMORY;
3165 return log_oom();
3166 }
3167
3168 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3169 if (r < 0) {
3170 *exit_status = EXIT_USER;
3171 if (r == -EILSEQ) {
3172 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3173 return -EOPNOTSUPP;
3174 }
3175 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3176 }
3177
3178 if (!uid_is_valid(uid)) {
3179 *exit_status = EXIT_USER;
3180 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
3181 return -ESRCH;
3182 }
3183
3184 if (!gid_is_valid(gid)) {
3185 *exit_status = EXIT_USER;
3186 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
3187 return -ESRCH;
3188 }
3189
3190 if (dcreds->user)
3191 username = dcreds->user->name;
3192
3193 } else {
3194 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3195 if (r < 0) {
3196 *exit_status = EXIT_USER;
3197 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3198 }
3199
3200 r = get_fixed_group(context, &groupname, &gid);
3201 if (r < 0) {
3202 *exit_status = EXIT_GROUP;
3203 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3204 }
3205 }
3206
3207 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3208 r = get_supplementary_groups(context, username, groupname, gid,
3209 &supplementary_gids, &ngids);
3210 if (r < 0) {
3211 *exit_status = EXIT_GROUP;
3212 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3213 }
3214
3215 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3216 if (r < 0) {
3217 *exit_status = EXIT_USER;
3218 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3219 }
3220
3221 user_lookup_fd = safe_close(user_lookup_fd);
3222
3223 r = acquire_home(context, uid, &home, &home_buffer);
3224 if (r < 0) {
3225 *exit_status = EXIT_CHDIR;
3226 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3227 }
3228
3229 /* If a socket is connected to STDIN/STDOUT/STDERR, we
3230 * must sure to drop O_NONBLOCK */
3231 if (socket_fd >= 0)
3232 (void) fd_nonblock(socket_fd, false);
3233
3234 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3235 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3236 if (params->cgroup_path) {
3237 _cleanup_free_ char *p = NULL;
3238
3239 r = exec_parameters_get_cgroup_path(params, &p);
3240 if (r < 0) {
3241 *exit_status = EXIT_CGROUP;
3242 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3243 }
3244
3245 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3246 if (r < 0) {
3247 *exit_status = EXIT_CGROUP;
3248 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3249 }
3250 }
3251
3252 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3253 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3254 if (r < 0) {
3255 *exit_status = EXIT_NETWORK;
3256 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3257 }
3258 }
3259
3260 r = setup_input(context, params, socket_fd, named_iofds);
3261 if (r < 0) {
3262 *exit_status = EXIT_STDIN;
3263 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3264 }
3265
3266 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3267 if (r < 0) {
3268 *exit_status = EXIT_STDOUT;
3269 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3270 }
3271
3272 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3273 if (r < 0) {
3274 *exit_status = EXIT_STDERR;
3275 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3276 }
3277
3278 if (context->oom_score_adjust_set) {
3279 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3280 * prohibit write access to this file, and we shouldn't trip up over that. */
3281 r = set_oom_score_adjust(context->oom_score_adjust);
3282 if (IN_SET(r, -EPERM, -EACCES))
3283 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3284 else if (r < 0) {
3285 *exit_status = EXIT_OOM_ADJUST;
3286 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3287 }
3288 }
3289
3290 if (context->nice_set) {
3291 r = setpriority_closest(context->nice);
3292 if (r < 0)
3293 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
3294 }
3295
3296 if (context->cpu_sched_set) {
3297 struct sched_param param = {
3298 .sched_priority = context->cpu_sched_priority,
3299 };
3300
3301 r = sched_setscheduler(0,
3302 context->cpu_sched_policy |
3303 (context->cpu_sched_reset_on_fork ?
3304 SCHED_RESET_ON_FORK : 0),
3305 &param);
3306 if (r < 0) {
3307 *exit_status = EXIT_SETSCHEDULER;
3308 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3309 }
3310 }
3311
3312 if (context->cpu_set.set)
3313 if (sched_setaffinity(0, context->cpu_set.allocated, context->cpu_set.set) < 0) {
3314 *exit_status = EXIT_CPUAFFINITY;
3315 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3316 }
3317
3318 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3319 r = apply_numa_policy(&context->numa_policy);
3320 if (r == -EOPNOTSUPP)
3321 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
3322 else if (r < 0) {
3323 *exit_status = EXIT_NUMA_POLICY;
3324 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
3325 }
3326 }
3327
3328 if (context->ioprio_set)
3329 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3330 *exit_status = EXIT_IOPRIO;
3331 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3332 }
3333
3334 if (context->timer_slack_nsec != NSEC_INFINITY)
3335 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3336 *exit_status = EXIT_TIMERSLACK;
3337 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3338 }
3339
3340 if (context->personality != PERSONALITY_INVALID) {
3341 r = safe_personality(context->personality);
3342 if (r < 0) {
3343 *exit_status = EXIT_PERSONALITY;
3344 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3345 }
3346 }
3347
3348 if (context->utmp_id)
3349 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3350 context->tty_path,
3351 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
3352 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3353 USER_PROCESS,
3354 username);
3355
3356 if (uid_is_valid(uid)) {
3357 r = chown_terminal(STDIN_FILENO, uid);
3358 if (r < 0) {
3359 *exit_status = EXIT_STDIN;
3360 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3361 }
3362 }
3363
3364 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3365 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3366 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3367 * touch a single hierarchy too. */
3368 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3369 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3370 if (r < 0) {
3371 *exit_status = EXIT_CGROUP;
3372 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3373 }
3374 }
3375
3376 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3377 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3378 if (r < 0)
3379 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3380 }
3381
3382 r = build_environment(
3383 unit,
3384 context,
3385 params,
3386 n_fds,
3387 home,
3388 username,
3389 shell,
3390 journal_stream_dev,
3391 journal_stream_ino,
3392 &our_env);
3393 if (r < 0) {
3394 *exit_status = EXIT_MEMORY;
3395 return log_oom();
3396 }
3397
3398 r = build_pass_environment(context, &pass_env);
3399 if (r < 0) {
3400 *exit_status = EXIT_MEMORY;
3401 return log_oom();
3402 }
3403
3404 accum_env = strv_env_merge(5,
3405 params->environment,
3406 our_env,
3407 pass_env,
3408 context->environment,
3409 files_env,
3410 NULL);
3411 if (!accum_env) {
3412 *exit_status = EXIT_MEMORY;
3413 return log_oom();
3414 }
3415 accum_env = strv_env_clean(accum_env);
3416
3417 (void) umask(context->umask);
3418
3419 r = setup_keyring(unit, context, params, uid, gid);
3420 if (r < 0) {
3421 *exit_status = EXIT_KEYRING;
3422 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3423 }
3424
3425 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3426 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3427
3428 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3429 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3430
3431 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3432 if (needs_ambient_hack)
3433 needs_setuid = false;
3434 else
3435 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3436
3437 if (needs_sandboxing) {
3438 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3439 * present. The actual MAC context application will happen later, as late as possible, to avoid
3440 * impacting our own code paths. */
3441
3442 #if HAVE_SELINUX
3443 use_selinux = mac_selinux_use();
3444 #endif
3445 #if ENABLE_SMACK
3446 use_smack = mac_smack_use();
3447 #endif
3448 #if HAVE_APPARMOR
3449 use_apparmor = mac_apparmor_use();
3450 #endif
3451 }
3452
3453 if (needs_sandboxing) {
3454 int which_failed;
3455
3456 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3457 * is set here. (See below.) */
3458
3459 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3460 if (r < 0) {
3461 *exit_status = EXIT_LIMITS;
3462 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3463 }
3464 }
3465
3466 if (needs_setuid) {
3467
3468 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3469 * wins here. (See above.) */
3470
3471 if (context->pam_name && username) {
3472 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3473 if (r < 0) {
3474 *exit_status = EXIT_PAM;
3475 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3476 }
3477
3478 ngids_after_pam = getgroups_alloc(&gids_after_pam);
3479 if (ngids_after_pam < 0) {
3480 *exit_status = EXIT_MEMORY;
3481 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
3482 }
3483 }
3484 }
3485
3486 if (needs_sandboxing) {
3487 #if HAVE_SELINUX
3488 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3489 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3490 if (r < 0) {
3491 *exit_status = EXIT_SELINUX_CONTEXT;
3492 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3493 }
3494 }
3495 #endif
3496
3497 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
3498 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
3499 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
3500 if (context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
3501 userns_set_up = true;
3502 r = setup_private_users(saved_uid, saved_gid, uid, gid);
3503 if (r < 0) {
3504 *exit_status = EXIT_USER;
3505 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
3506 }
3507 }
3508 }
3509
3510 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3511
3512 if (ns_type_supported(NAMESPACE_NET)) {
3513 r = setup_netns(runtime->netns_storage_socket);
3514 if (r == -EPERM)
3515 log_unit_warning_errno(unit, r,
3516 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
3517 else if (r < 0) {
3518 *exit_status = EXIT_NETWORK;
3519 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3520 }
3521 } else if (context->network_namespace_path) {
3522 *exit_status = EXIT_NETWORK;
3523 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
3524 "NetworkNamespacePath= is not supported, refusing.");
3525 } else
3526 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3527 }
3528
3529 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3530 if (needs_mount_namespace) {
3531 _cleanup_free_ char *error_path = NULL;
3532
3533 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3534 if (r < 0) {
3535 *exit_status = EXIT_NAMESPACE;
3536 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3537 error_path ? ": " : "", strempty(error_path));
3538 }
3539 }
3540
3541 if (context->protect_hostname) {
3542 if (ns_type_supported(NAMESPACE_UTS)) {
3543 if (unshare(CLONE_NEWUTS) < 0) {
3544 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
3545 *exit_status = EXIT_NAMESPACE;
3546 return log_unit_error_errno(unit, errno, "Failed to set up UTS namespacing: %m");
3547 }
3548
3549 log_unit_warning(unit, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
3550 }
3551 } else
3552 log_unit_warning(unit, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
3553 #if HAVE_SECCOMP
3554 r = seccomp_protect_hostname();
3555 if (r < 0) {
3556 *exit_status = EXIT_SECCOMP;
3557 return log_unit_error_errno(unit, r, "Failed to apply hostname restrictions: %m");
3558 }
3559 #endif
3560 }
3561
3562 /* Drop groups as early as possible.
3563 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
3564 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
3565 if (needs_setuid) {
3566 _cleanup_free_ gid_t *gids_to_enforce = NULL;
3567 int ngids_to_enforce = 0;
3568
3569 ngids_to_enforce = merge_gid_lists(supplementary_gids,
3570 ngids,
3571 gids_after_pam,
3572 ngids_after_pam,
3573 &gids_to_enforce);
3574 if (ngids_to_enforce < 0) {
3575 *exit_status = EXIT_MEMORY;
3576 return log_unit_error_errno(unit,
3577 ngids_to_enforce,
3578 "Failed to merge group lists. Group membership might be incorrect: %m");
3579 }
3580
3581 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
3582 if (r < 0) {
3583 *exit_status = EXIT_GROUP;
3584 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3585 }
3586 }
3587
3588 /* If the user namespace was not set up above, try to do it now.
3589 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
3590 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
3591 * case of mount namespaces being less privileged when the mount point list is copied from a
3592 * different user namespace). */
3593
3594 if (needs_sandboxing && context->private_users && !userns_set_up) {
3595 r = setup_private_users(saved_uid, saved_gid, uid, gid);
3596 if (r < 0) {
3597 *exit_status = EXIT_USER;
3598 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3599 }
3600 }
3601
3602 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3603 * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3604 * however if we have it as we want to keep it open until the final execve(). */
3605
3606 if (params->exec_fd >= 0) {
3607 exec_fd = params->exec_fd;
3608
3609 if (exec_fd < 3 + (int) n_fds) {
3610 int moved_fd;
3611
3612 /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3613 * process we are about to execute. */
3614
3615 moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3616 if (moved_fd < 0) {
3617 *exit_status = EXIT_FDS;
3618 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3619 }
3620
3621 safe_close(exec_fd);
3622 exec_fd = moved_fd;
3623 } else {
3624 /* This fd should be FD_CLOEXEC already, but let's make sure. */
3625 r = fd_cloexec(exec_fd, true);
3626 if (r < 0) {
3627 *exit_status = EXIT_FDS;
3628 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3629 }
3630 }
3631
3632 fds_with_exec_fd = newa(int, n_fds + 1);
3633 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3634 fds_with_exec_fd[n_fds] = exec_fd;
3635 n_fds_with_exec_fd = n_fds + 1;
3636 } else {
3637 fds_with_exec_fd = fds;
3638 n_fds_with_exec_fd = n_fds;
3639 }
3640
3641 r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3642 if (r >= 0)
3643 r = shift_fds(fds, n_fds);
3644 if (r >= 0)
3645 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3646 if (r < 0) {
3647 *exit_status = EXIT_FDS;
3648 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3649 }
3650
3651 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3652 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3653 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3654 * came this far. */
3655
3656 secure_bits = context->secure_bits;
3657
3658 if (needs_sandboxing) {
3659 uint64_t bset;
3660
3661 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3662 * requested. (Note this is placed after the general resource limit initialization, see
3663 * above, in order to take precedence.) */
3664 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3665 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3666 *exit_status = EXIT_LIMITS;
3667 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3668 }
3669 }
3670
3671 #if ENABLE_SMACK
3672 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3673 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3674 if (use_smack) {
3675 r = setup_smack(context, command);
3676 if (r < 0) {
3677 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3678 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3679 }
3680 }
3681 #endif
3682
3683 bset = context->capability_bounding_set;
3684 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3685 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3686 * instead of us doing that */
3687 if (needs_ambient_hack)
3688 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3689 (UINT64_C(1) << CAP_SETUID) |
3690 (UINT64_C(1) << CAP_SETGID);
3691
3692 if (!cap_test_all(bset)) {
3693 r = capability_bounding_set_drop(bset, false);
3694 if (r < 0) {
3695 *exit_status = EXIT_CAPABILITIES;
3696 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3697 }
3698 }
3699
3700 /* This is done before enforce_user, but ambient set
3701 * does not survive over setresuid() if keep_caps is not set. */
3702 if (!needs_ambient_hack) {
3703 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3704 if (r < 0) {
3705 *exit_status = EXIT_CAPABILITIES;
3706 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3707 }
3708 }
3709 }
3710
3711 /* chroot to root directory first, before we lose the ability to chroot */
3712 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
3713 if (r < 0)
3714 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
3715
3716 if (needs_setuid) {
3717 if (uid_is_valid(uid)) {
3718 r = enforce_user(context, uid);
3719 if (r < 0) {
3720 *exit_status = EXIT_USER;
3721 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3722 }
3723
3724 if (!needs_ambient_hack &&
3725 context->capability_ambient_set != 0) {
3726
3727 /* Fix the ambient capabilities after user change. */
3728 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3729 if (r < 0) {
3730 *exit_status = EXIT_CAPABILITIES;
3731 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3732 }
3733
3734 /* If we were asked to change user and ambient capabilities
3735 * were requested, we had to add keep-caps to the securebits
3736 * so that we would maintain the inherited capability set
3737 * through the setresuid(). Make sure that the bit is added
3738 * also to the context secure_bits so that we don't try to
3739 * drop the bit away next. */
3740
3741 secure_bits |= 1<<SECURE_KEEP_CAPS;
3742 }
3743 }
3744 }
3745
3746 /* Apply working directory here, because the working directory might be on NFS and only the user running
3747 * this service might have the correct privilege to change to the working directory */
3748 r = apply_working_directory(context, params, home, exit_status);
3749 if (r < 0)
3750 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3751
3752 if (needs_sandboxing) {
3753 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3754 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3755 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3756 * are restricted. */
3757
3758 #if HAVE_SELINUX
3759 if (use_selinux) {
3760 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3761
3762 if (exec_context) {
3763 r = setexeccon(exec_context);
3764 if (r < 0) {
3765 *exit_status = EXIT_SELINUX_CONTEXT;
3766 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3767 }
3768 }
3769 }
3770 #endif
3771
3772 #if HAVE_APPARMOR
3773 if (use_apparmor && context->apparmor_profile) {
3774 r = aa_change_onexec(context->apparmor_profile);
3775 if (r < 0 && !context->apparmor_profile_ignore) {
3776 *exit_status = EXIT_APPARMOR_PROFILE;
3777 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3778 }
3779 }
3780 #endif
3781
3782 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3783 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3784 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3785 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3786 *exit_status = EXIT_SECUREBITS;
3787 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3788 }
3789
3790 if (context_has_no_new_privileges(context))
3791 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3792 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3793 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3794 }
3795
3796 #if HAVE_SECCOMP
3797 r = apply_address_families(unit, context);
3798 if (r < 0) {
3799 *exit_status = EXIT_ADDRESS_FAMILIES;
3800 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3801 }
3802
3803 r = apply_memory_deny_write_execute(unit, context);
3804 if (r < 0) {
3805 *exit_status = EXIT_SECCOMP;
3806 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3807 }
3808
3809 r = apply_restrict_realtime(unit, context);
3810 if (r < 0) {
3811 *exit_status = EXIT_SECCOMP;
3812 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3813 }
3814
3815 r = apply_restrict_suid_sgid(unit, context);
3816 if (r < 0) {
3817 *exit_status = EXIT_SECCOMP;
3818 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3819 }
3820
3821 r = apply_restrict_namespaces(unit, context);
3822 if (r < 0) {
3823 *exit_status = EXIT_SECCOMP;
3824 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3825 }
3826
3827 r = apply_protect_sysctl(unit, context);
3828 if (r < 0) {
3829 *exit_status = EXIT_SECCOMP;
3830 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3831 }
3832
3833 r = apply_protect_kernel_modules(unit, context);
3834 if (r < 0) {
3835 *exit_status = EXIT_SECCOMP;
3836 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3837 }
3838
3839 r = apply_protect_kernel_logs(unit, context);
3840 if (r < 0) {
3841 *exit_status = EXIT_SECCOMP;
3842 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
3843 }
3844
3845 r = apply_protect_clock(unit, context);
3846 if (r < 0) {
3847 *exit_status = EXIT_SECCOMP;
3848 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
3849 }
3850
3851 r = apply_private_devices(unit, context);
3852 if (r < 0) {
3853 *exit_status = EXIT_SECCOMP;
3854 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3855 }
3856
3857 r = apply_syscall_archs(unit, context);
3858 if (r < 0) {
3859 *exit_status = EXIT_SECCOMP;
3860 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3861 }
3862
3863 r = apply_lock_personality(unit, context);
3864 if (r < 0) {
3865 *exit_status = EXIT_SECCOMP;
3866 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3867 }
3868
3869 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3870 * by the filter as little as possible. */
3871 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3872 if (r < 0) {
3873 *exit_status = EXIT_SECCOMP;
3874 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3875 }
3876 #endif
3877 }
3878
3879 if (!strv_isempty(context->unset_environment)) {
3880 char **ee = NULL;
3881
3882 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3883 if (!ee) {
3884 *exit_status = EXIT_MEMORY;
3885 return log_oom();
3886 }
3887
3888 strv_free_and_replace(accum_env, ee);
3889 }
3890
3891 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3892 replaced_argv = replace_env_argv(command->argv, accum_env);
3893 if (!replaced_argv) {
3894 *exit_status = EXIT_MEMORY;
3895 return log_oom();
3896 }
3897 final_argv = replaced_argv;
3898 } else
3899 final_argv = command->argv;
3900
3901 if (DEBUG_LOGGING) {
3902 _cleanup_free_ char *line;
3903
3904 line = exec_command_line(final_argv);
3905 if (line)
3906 log_struct(LOG_DEBUG,
3907 "EXECUTABLE=%s", command->path,
3908 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3909 LOG_UNIT_ID(unit),
3910 LOG_UNIT_INVOCATION_ID(unit));
3911 }
3912
3913 if (exec_fd >= 0) {
3914 uint8_t hot = 1;
3915
3916 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3917 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3918
3919 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3920 *exit_status = EXIT_EXEC;
3921 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3922 }
3923 }
3924
3925 execve(command->path, final_argv, accum_env);
3926 r = -errno;
3927
3928 if (exec_fd >= 0) {
3929 uint8_t hot = 0;
3930
3931 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3932 * that POLLHUP on it no longer means execve() succeeded. */
3933
3934 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3935 *exit_status = EXIT_EXEC;
3936 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
3937 }
3938 }
3939
3940 if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3941 log_struct_errno(LOG_INFO, r,
3942 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3943 LOG_UNIT_ID(unit),
3944 LOG_UNIT_INVOCATION_ID(unit),
3945 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3946 command->path),
3947 "EXECUTABLE=%s", command->path);
3948 return 0;
3949 }
3950
3951 *exit_status = EXIT_EXEC;
3952 return log_unit_error_errno(unit, r, "Failed to execute command: %m");
3953 }
3954
/* Forward declarations of two file-local helpers that exec_spawn() below calls before it forks. */
static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
3957
3958 int exec_spawn(Unit *unit,
3959 ExecCommand *command,
3960 const ExecContext *context,
3961 const ExecParameters *params,
3962 ExecRuntime *runtime,
3963 DynamicCreds *dcreds,
3964 pid_t *ret) {
3965
3966 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
3967 _cleanup_free_ char *subcgroup_path = NULL;
3968 _cleanup_strv_free_ char **files_env = NULL;
3969 size_t n_storage_fds = 0, n_socket_fds = 0;
3970 _cleanup_free_ char *line = NULL;
3971 pid_t pid;
3972
3973 assert(unit);
3974 assert(command);
3975 assert(context);
3976 assert(ret);
3977 assert(params);
3978 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
3979
3980 if (context->std_input == EXEC_INPUT_SOCKET ||
3981 context->std_output == EXEC_OUTPUT_SOCKET ||
3982 context->std_error == EXEC_OUTPUT_SOCKET) {
3983
3984 if (params->n_socket_fds > 1) {
3985 log_unit_error(unit, "Got more than one socket.");
3986 return -EINVAL;
3987 }
3988
3989 if (params->n_socket_fds == 0) {
3990 log_unit_error(unit, "Got no socket.");
3991 return -EINVAL;
3992 }
3993
3994 socket_fd = params->fds[0];
3995 } else {
3996 socket_fd = -1;
3997 fds = params->fds;
3998 n_socket_fds = params->n_socket_fds;
3999 n_storage_fds = params->n_storage_fds;
4000 }
4001
4002 r = exec_context_named_iofds(context, params, named_iofds);
4003 if (r < 0)
4004 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4005
4006 r = exec_context_load_environment(unit, context, &files_env);
4007 if (r < 0)
4008 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
4009
4010 line = exec_command_line(command->argv);
4011 if (!line)
4012 return log_oom();
4013
4014 log_struct(LOG_DEBUG,
4015 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
4016 "EXECUTABLE=%s", command->path,
4017 LOG_UNIT_ID(unit),
4018 LOG_UNIT_INVOCATION_ID(unit));
4019
4020 if (params->cgroup_path) {
4021 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4022 if (r < 0)
4023 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4024 if (r > 0) { /* We are using a child cgroup */
4025 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4026 if (r < 0)
4027 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4028 }
4029 }
4030
4031 pid = fork();
4032 if (pid < 0)
4033 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
4034
4035 if (pid == 0) {
4036 int exit_status = EXIT_SUCCESS;
4037
4038 r = exec_child(unit,
4039 command,
4040 context,
4041 params,
4042 runtime,
4043 dcreds,
4044 socket_fd,
4045 named_iofds,
4046 fds,
4047 n_socket_fds,
4048 n_storage_fds,
4049 files_env,
4050 unit->manager->user_lookup_fds[1],
4051 &exit_status);
4052
4053 if (r < 0) {
4054 const char *status =
4055 exit_status_to_string(exit_status,
4056 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
4057
4058 log_struct_errno(LOG_ERR, r,
4059 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4060 LOG_UNIT_ID(unit),
4061 LOG_UNIT_INVOCATION_ID(unit),
4062 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
4063 status, command->path),
4064 "EXECUTABLE=%s", command->path);
4065 }
4066
4067 _exit(exit_status);
4068 }
4069
4070 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
4071
4072 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
4073 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
4074 * process will be killed too). */
4075 if (subcgroup_path)
4076 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
4077
4078 exec_status_start(&command->exec_status, pid);
4079
4080 *ret = pid;
4081 return 0;
4082 }
4083
4084 void exec_context_init(ExecContext *c) {
4085 ExecDirectoryType i;
4086
4087 assert(c);
4088
4089 c->umask = 0022;
4090 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
4091 c->cpu_sched_policy = SCHED_OTHER;
4092 c->syslog_priority = LOG_DAEMON|LOG_INFO;
4093 c->syslog_level_prefix = true;
4094 c->ignore_sigpipe = true;
4095 c->timer_slack_nsec = NSEC_INFINITY;
4096 c->personality = PERSONALITY_INVALID;
4097 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4098 c->directories[i].mode = 0755;
4099 c->timeout_clean_usec = USEC_INFINITY;
4100 c->capability_bounding_set = CAP_ALL;
4101 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4102 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
4103 c->log_level_max = -1;
4104 numa_policy_reset(&c->numa_policy);
4105 }
4106
4107 void exec_context_done(ExecContext *c) {
4108 ExecDirectoryType i;
4109 size_t l;
4110
4111 assert(c);
4112
4113 c->environment = strv_free(c->environment);
4114 c->environment_files = strv_free(c->environment_files);
4115 c->pass_environment = strv_free(c->pass_environment);
4116 c->unset_environment = strv_free(c->unset_environment);
4117
4118 rlimit_free_all(c->rlimit);
4119
4120 for (l = 0; l < 3; l++) {
4121 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
4122 c->stdio_file[l] = mfree(c->stdio_file[l]);
4123 }
4124
4125 c->working_directory = mfree(c->working_directory);
4126 c->root_directory = mfree(c->root_directory);
4127 c->root_image = mfree(c->root_image);
4128 c->tty_path = mfree(c->tty_path);
4129 c->syslog_identifier = mfree(c->syslog_identifier);
4130 c->user = mfree(c->user);
4131 c->group = mfree(c->group);
4132
4133 c->supplementary_groups = strv_free(c->supplementary_groups);
4134
4135 c->pam_name = mfree(c->pam_name);
4136
4137 c->read_only_paths = strv_free(c->read_only_paths);
4138 c->read_write_paths = strv_free(c->read_write_paths);
4139 c->inaccessible_paths = strv_free(c->inaccessible_paths);
4140
4141 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
4142 c->bind_mounts = NULL;
4143 c->n_bind_mounts = 0;
4144 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4145 c->temporary_filesystems = NULL;
4146 c->n_temporary_filesystems = 0;
4147
4148 cpu_set_reset(&c->cpu_set);
4149 numa_policy_reset(&c->numa_policy);
4150
4151 c->utmp_id = mfree(c->utmp_id);
4152 c->selinux_context = mfree(c->selinux_context);
4153 c->apparmor_profile = mfree(c->apparmor_profile);
4154 c->smack_process_label = mfree(c->smack_process_label);
4155
4156 c->syscall_filter = hashmap_free(c->syscall_filter);
4157 c->syscall_archs = set_free(c->syscall_archs);
4158 c->address_families = set_free(c->address_families);
4159
4160 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4161 c->directories[i].paths = strv_free(c->directories[i].paths);
4162
4163 c->log_level_max = -1;
4164
4165 exec_context_free_log_extra_fields(c);
4166
4167 c->log_ratelimit_interval_usec = 0;
4168 c->log_ratelimit_burst = 0;
4169
4170 c->stdin_data = mfree(c->stdin_data);
4171 c->stdin_data_size = 0;
4172
4173 c->network_namespace_path = mfree(c->network_namespace_path);
4174
4175 c->log_namespace = mfree(c->log_namespace);
4176 }
4177
4178 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4179 char **i;
4180
4181 assert(c);
4182
4183 if (!runtime_prefix)
4184 return 0;
4185
4186 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4187 _cleanup_free_ char *p;
4188
4189 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4190 p = path_join(runtime_prefix, "private", *i);
4191 else
4192 p = path_join(runtime_prefix, *i);
4193 if (!p)
4194 return -ENOMEM;
4195
4196 /* We execute this synchronously, since we need to be sure this is gone when we start the
4197 * service next. */
4198 (void) rm_rf(p, REMOVE_ROOT);
4199 }
4200
4201 return 0;
4202 }
4203
4204 static void exec_command_done(ExecCommand *c) {
4205 assert(c);
4206
4207 c->path = mfree(c->path);
4208 c->argv = strv_free(c->argv);
4209 }
4210
4211 void exec_command_done_array(ExecCommand *c, size_t n) {
4212 size_t i;
4213
4214 for (i = 0; i < n; i++)
4215 exec_command_done(c+i);
4216 }
4217
4218 ExecCommand* exec_command_free_list(ExecCommand *c) {
4219 ExecCommand *i;
4220
4221 while ((i = c)) {
4222 LIST_REMOVE(command, c, i);
4223 exec_command_done(i);
4224 free(i);
4225 }
4226
4227 return NULL;
4228 }
4229
4230 void exec_command_free_array(ExecCommand **c, size_t n) {
4231 size_t i;
4232
4233 for (i = 0; i < n; i++)
4234 c[i] = exec_command_free_list(c[i]);
4235 }
4236
4237 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4238 size_t i;
4239
4240 for (i = 0; i < n; i++)
4241 exec_status_reset(&c[i].exec_status);
4242 }
4243
4244 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4245 size_t i;
4246
4247 for (i = 0; i < n; i++) {
4248 ExecCommand *z;
4249
4250 LIST_FOREACH(command, z, c[i])
4251 exec_status_reset(&z->exec_status);
4252 }
4253 }
4254
/* Context handed to invalid_env() as userdata while an environment file is being cleaned up. */
typedef struct InvalidEnvInfo {
        const Unit *unit; /* unit the log message is generated for */
        const char *path; /* path of the environment file the assignment was read from */
} InvalidEnvInfo;
4259
4260 static void invalid_env(const char *p, void *userdata) {
4261 InvalidEnvInfo *info = userdata;
4262
4263 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4264 }
4265
4266 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4267 assert(c);
4268
4269 switch (fd_index) {
4270
4271 case STDIN_FILENO:
4272 if (c->std_input != EXEC_INPUT_NAMED_FD)
4273 return NULL;
4274
4275 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4276
4277 case STDOUT_FILENO:
4278 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4279 return NULL;
4280
4281 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4282
4283 case STDERR_FILENO:
4284 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4285 return NULL;
4286
4287 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4288
4289 default:
4290 return NULL;
4291 }
4292 }
4293
4294 static int exec_context_named_iofds(
4295 const ExecContext *c,
4296 const ExecParameters *p,
4297 int named_iofds[static 3]) {
4298
4299 size_t i, targets;
4300 const char* stdio_fdname[3];
4301 size_t n_fds;
4302
4303 assert(c);
4304 assert(p);
4305 assert(named_iofds);
4306
4307 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4308 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4309 (c->std_error == EXEC_OUTPUT_NAMED_FD);
4310
4311 for (i = 0; i < 3; i++)
4312 stdio_fdname[i] = exec_context_fdname(c, i);
4313
4314 n_fds = p->n_storage_fds + p->n_socket_fds;
4315
4316 for (i = 0; i < n_fds && targets > 0; i++)
4317 if (named_iofds[STDIN_FILENO] < 0 &&
4318 c->std_input == EXEC_INPUT_NAMED_FD &&
4319 stdio_fdname[STDIN_FILENO] &&
4320 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4321
4322 named_iofds[STDIN_FILENO] = p->fds[i];
4323 targets--;
4324
4325 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4326 c->std_output == EXEC_OUTPUT_NAMED_FD &&
4327 stdio_fdname[STDOUT_FILENO] &&
4328 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4329
4330 named_iofds[STDOUT_FILENO] = p->fds[i];
4331 targets--;
4332
4333 } else if (named_iofds[STDERR_FILENO] < 0 &&
4334 c->std_error == EXEC_OUTPUT_NAMED_FD &&
4335 stdio_fdname[STDERR_FILENO] &&
4336 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4337
4338 named_iofds[STDERR_FILENO] = p->fds[i];
4339 targets--;
4340 }
4341
4342 return targets == 0 ? 0 : -ENOENT;
4343 }
4344
4345 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
4346 char **i, **r = NULL;
4347
4348 assert(c);
4349 assert(l);
4350
4351 STRV_FOREACH(i, c->environment_files) {
4352 char *fn;
4353 int k;
4354 unsigned n;
4355 bool ignore = false;
4356 char **p;
4357 _cleanup_globfree_ glob_t pglob = {};
4358
4359 fn = *i;
4360
4361 if (fn[0] == '-') {
4362 ignore = true;
4363 fn++;
4364 }
4365
4366 if (!path_is_absolute(fn)) {
4367 if (ignore)
4368 continue;
4369
4370 strv_free(r);
4371 return -EINVAL;
4372 }
4373
4374 /* Filename supports globbing, take all matching files */
4375 k = safe_glob(fn, 0, &pglob);
4376 if (k < 0) {
4377 if (ignore)
4378 continue;
4379
4380 strv_free(r);
4381 return k;
4382 }
4383
4384 /* When we don't match anything, -ENOENT should be returned */
4385 assert(pglob.gl_pathc > 0);
4386
4387 for (n = 0; n < pglob.gl_pathc; n++) {
4388 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4389 if (k < 0) {
4390 if (ignore)
4391 continue;
4392
4393 strv_free(r);
4394 return k;
4395 }
4396 /* Log invalid environment variables with filename */
4397 if (p) {
4398 InvalidEnvInfo info = {
4399 .unit = unit,
4400 .path = pglob.gl_pathv[n]
4401 };
4402
4403 p = strv_env_clean_with_callback(p, invalid_env, &info);
4404 }
4405
4406 if (!r)
4407 r = p;
4408 else {
4409 char **m;
4410
4411 m = strv_env_merge(2, r, p);
4412 strv_free(r);
4413 strv_free(p);
4414 if (!m)
4415 return -ENOMEM;
4416
4417 r = m;
4418 }
4419 }
4420 }
4421
4422 *l = r;
4423
4424 return 0;
4425 }
4426
4427 static bool tty_may_match_dev_console(const char *tty) {
4428 _cleanup_free_ char *resolved = NULL;
4429
4430 if (!tty)
4431 return true;
4432
4433 tty = skip_dev_prefix(tty);
4434
4435 /* trivial identity? */
4436 if (streq(tty, "console"))
4437 return true;
4438
4439 if (resolve_dev_console(&resolved) < 0)
4440 return true; /* if we could not resolve, assume it may */
4441
4442 /* "tty0" means the active VC, so it may be the same sometimes */
4443 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4444 }
4445
4446 static bool exec_context_may_touch_tty(const ExecContext *ec) {
4447 assert(ec);
4448
4449 return ec->tty_reset ||
4450 ec->tty_vhangup ||
4451 ec->tty_vt_disallocate ||
4452 is_terminal_input(ec->std_input) ||
4453 is_terminal_output(ec->std_output) ||
4454 is_terminal_output(ec->std_error);
4455 }
4456
4457 bool exec_context_may_touch_console(const ExecContext *ec) {
4458
4459 return exec_context_may_touch_tty(ec) &&
4460 tty_may_match_dev_console(exec_context_tty_path(ec));
4461 }
4462
4463 static void strv_fprintf(FILE *f, char **l) {
4464 char **g;
4465
4466 assert(f);
4467
4468 STRV_FOREACH(g, l)
4469 fprintf(f, " %s", *g);
4470 }
4471
4472 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4473 char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
4474 ExecDirectoryType dt;
4475 unsigned i;
4476 int r;
4477
4478 assert(c);
4479 assert(f);
4480
4481 prefix = strempty(prefix);
4482
4483 fprintf(f,
4484 "%sUMask: %04o\n"
4485 "%sWorkingDirectory: %s\n"
4486 "%sRootDirectory: %s\n"
4487 "%sNonBlocking: %s\n"
4488 "%sPrivateTmp: %s\n"
4489 "%sPrivateDevices: %s\n"
4490 "%sProtectKernelTunables: %s\n"
4491 "%sProtectKernelModules: %s\n"
4492 "%sProtectKernelLogs: %s\n"
4493 "%sProtectClock: %s\n"
4494 "%sProtectControlGroups: %s\n"
4495 "%sPrivateNetwork: %s\n"
4496 "%sPrivateUsers: %s\n"
4497 "%sProtectHome: %s\n"
4498 "%sProtectSystem: %s\n"
4499 "%sMountAPIVFS: %s\n"
4500 "%sIgnoreSIGPIPE: %s\n"
4501 "%sMemoryDenyWriteExecute: %s\n"
4502 "%sRestrictRealtime: %s\n"
4503 "%sRestrictSUIDSGID: %s\n"
4504 "%sKeyringMode: %s\n"
4505 "%sProtectHostname: %s\n",
4506 prefix, c->umask,
4507 prefix, c->working_directory ? c->working_directory : "/",
4508 prefix, c->root_directory ? c->root_directory : "/",
4509 prefix, yes_no(c->non_blocking),
4510 prefix, yes_no(c->private_tmp),
4511 prefix, yes_no(c->private_devices),
4512 prefix, yes_no(c->protect_kernel_tunables),
4513 prefix, yes_no(c->protect_kernel_modules),
4514 prefix, yes_no(c->protect_kernel_logs),
4515 prefix, yes_no(c->protect_clock),
4516 prefix, yes_no(c->protect_control_groups),
4517 prefix, yes_no(c->private_network),
4518 prefix, yes_no(c->private_users),
4519 prefix, protect_home_to_string(c->protect_home),
4520 prefix, protect_system_to_string(c->protect_system),
4521 prefix, yes_no(c->mount_apivfs),
4522 prefix, yes_no(c->ignore_sigpipe),
4523 prefix, yes_no(c->memory_deny_write_execute),
4524 prefix, yes_no(c->restrict_realtime),
4525 prefix, yes_no(c->restrict_suid_sgid),
4526 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4527 prefix, yes_no(c->protect_hostname));
4528
4529 if (c->root_image)
4530 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4531
4532 STRV_FOREACH(e, c->environment)
4533 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4534
4535 STRV_FOREACH(e, c->environment_files)
4536 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4537
4538 STRV_FOREACH(e, c->pass_environment)
4539 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4540
4541 STRV_FOREACH(e, c->unset_environment)
4542 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4543
4544 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4545
4546 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4547 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4548
4549 STRV_FOREACH(d, c->directories[dt].paths)
4550 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4551 }
4552
4553 fprintf(f,
4554 "%sTimeoutCleanSec: %s\n",
4555 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
4556
4557 if (c->nice_set)
4558 fprintf(f,
4559 "%sNice: %i\n",
4560 prefix, c->nice);
4561
4562 if (c->oom_score_adjust_set)
4563 fprintf(f,
4564 "%sOOMScoreAdjust: %i\n",
4565 prefix, c->oom_score_adjust);
4566
4567 for (i = 0; i < RLIM_NLIMITS; i++)
4568 if (c->rlimit[i]) {
4569 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4570 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4571 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4572 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4573 }
4574
4575 if (c->ioprio_set) {
4576 _cleanup_free_ char *class_str = NULL;
4577
4578 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4579 if (r >= 0)
4580 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4581
4582 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4583 }
4584
4585 if (c->cpu_sched_set) {
4586 _cleanup_free_ char *policy_str = NULL;
4587
4588 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4589 if (r >= 0)
4590 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4591
4592 fprintf(f,
4593 "%sCPUSchedulingPriority: %i\n"
4594 "%sCPUSchedulingResetOnFork: %s\n",
4595 prefix, c->cpu_sched_priority,
4596 prefix, yes_no(c->cpu_sched_reset_on_fork));
4597 }
4598
4599 if (c->cpu_set.set) {
4600 _cleanup_free_ char *affinity = NULL;
4601
4602 affinity = cpu_set_to_range_string(&c->cpu_set);
4603 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
4604 }
4605
4606 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
4607 _cleanup_free_ char *nodes = NULL;
4608
4609 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
4610 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
4611 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
4612 }
4613
4614 if (c->timer_slack_nsec != NSEC_INFINITY)
4615 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4616
4617 fprintf(f,
4618 "%sStandardInput: %s\n"
4619 "%sStandardOutput: %s\n"
4620 "%sStandardError: %s\n",
4621 prefix, exec_input_to_string(c->std_input),
4622 prefix, exec_output_to_string(c->std_output),
4623 prefix, exec_output_to_string(c->std_error));
4624
4625 if (c->std_input == EXEC_INPUT_NAMED_FD)
4626 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4627 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4628 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4629 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4630 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4631
4632 if (c->std_input == EXEC_INPUT_FILE)
4633 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4634 if (c->std_output == EXEC_OUTPUT_FILE)
4635 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4636 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4637 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4638 if (c->std_error == EXEC_OUTPUT_FILE)
4639 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4640 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4641 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4642
4643 if (c->tty_path)
4644 fprintf(f,
4645 "%sTTYPath: %s\n"
4646 "%sTTYReset: %s\n"
4647 "%sTTYVHangup: %s\n"
4648 "%sTTYVTDisallocate: %s\n",
4649 prefix, c->tty_path,
4650 prefix, yes_no(c->tty_reset),
4651 prefix, yes_no(c->tty_vhangup),
4652 prefix, yes_no(c->tty_vt_disallocate));
4653
4654 if (IN_SET(c->std_output,
4655 EXEC_OUTPUT_SYSLOG,
4656 EXEC_OUTPUT_KMSG,
4657 EXEC_OUTPUT_JOURNAL,
4658 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4659 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4660 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4661 IN_SET(c->std_error,
4662 EXEC_OUTPUT_SYSLOG,
4663 EXEC_OUTPUT_KMSG,
4664 EXEC_OUTPUT_JOURNAL,
4665 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4666 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4667 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4668
4669 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4670
4671 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4672 if (r >= 0)
4673 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4674
4675 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4676 if (r >= 0)
4677 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4678 }
4679
4680 if (c->log_level_max >= 0) {
4681 _cleanup_free_ char *t = NULL;
4682
4683 (void) log_level_to_string_alloc(c->log_level_max, &t);
4684
4685 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4686 }
4687
4688 if (c->log_ratelimit_interval_usec > 0) {
4689 char buf_timespan[FORMAT_TIMESPAN_MAX];
4690
4691 fprintf(f,
4692 "%sLogRateLimitIntervalSec: %s\n",
4693 prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
4694 }
4695
4696 if (c->log_ratelimit_burst > 0)
4697 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
4698
4699 if (c->n_log_extra_fields > 0) {
4700 size_t j;
4701
4702 for (j = 0; j < c->n_log_extra_fields; j++) {
4703 fprintf(f, "%sLogExtraFields: ", prefix);
4704 fwrite(c->log_extra_fields[j].iov_base,
4705 1, c->log_extra_fields[j].iov_len,
4706 f);
4707 fputc('\n', f);
4708 }
4709 }
4710
4711 if (c->log_namespace)
4712 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
4713
4714 if (c->secure_bits) {
4715 _cleanup_free_ char *str = NULL;
4716
4717 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4718 if (r >= 0)
4719 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4720 }
4721
4722 if (c->capability_bounding_set != CAP_ALL) {
4723 _cleanup_free_ char *str = NULL;
4724
4725 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4726 if (r >= 0)
4727 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4728 }
4729
4730 if (c->capability_ambient_set != 0) {
4731 _cleanup_free_ char *str = NULL;
4732
4733 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4734 if (r >= 0)
4735 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4736 }
4737
4738 if (c->user)
4739 fprintf(f, "%sUser: %s\n", prefix, c->user);
4740 if (c->group)
4741 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4742
4743 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4744
4745 if (!strv_isempty(c->supplementary_groups)) {
4746 fprintf(f, "%sSupplementaryGroups:", prefix);
4747 strv_fprintf(f, c->supplementary_groups);
4748 fputs("\n", f);
4749 }
4750
4751 if (c->pam_name)
4752 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4753
4754 if (!strv_isempty(c->read_write_paths)) {
4755 fprintf(f, "%sReadWritePaths:", prefix);
4756 strv_fprintf(f, c->read_write_paths);
4757 fputs("\n", f);
4758 }
4759
4760 if (!strv_isempty(c->read_only_paths)) {
4761 fprintf(f, "%sReadOnlyPaths:", prefix);
4762 strv_fprintf(f, c->read_only_paths);
4763 fputs("\n", f);
4764 }
4765
4766 if (!strv_isempty(c->inaccessible_paths)) {
4767 fprintf(f, "%sInaccessiblePaths:", prefix);
4768 strv_fprintf(f, c->inaccessible_paths);
4769 fputs("\n", f);
4770 }
4771
4772 if (c->n_bind_mounts > 0)
4773 for (i = 0; i < c->n_bind_mounts; i++)
4774 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4775 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4776 c->bind_mounts[i].ignore_enoent ? "-": "",
4777 c->bind_mounts[i].source,
4778 c->bind_mounts[i].destination,
4779 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4780
4781 if (c->n_temporary_filesystems > 0)
4782 for (i = 0; i < c->n_temporary_filesystems; i++) {
4783 TemporaryFileSystem *t = c->temporary_filesystems + i;
4784
4785 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4786 t->path,
4787 isempty(t->options) ? "" : ":",
4788 strempty(t->options));
4789 }
4790
4791 if (c->utmp_id)
4792 fprintf(f,
4793 "%sUtmpIdentifier: %s\n",
4794 prefix, c->utmp_id);
4795
4796 if (c->selinux_context)
4797 fprintf(f,
4798 "%sSELinuxContext: %s%s\n",
4799 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4800
4801 if (c->apparmor_profile)
4802 fprintf(f,
4803 "%sAppArmorProfile: %s%s\n",
4804 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4805
4806 if (c->smack_process_label)
4807 fprintf(f,
4808 "%sSmackProcessLabel: %s%s\n",
4809 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4810
4811 if (c->personality != PERSONALITY_INVALID)
4812 fprintf(f,
4813 "%sPersonality: %s\n",
4814 prefix, strna(personality_to_string(c->personality)));
4815
4816 fprintf(f,
4817 "%sLockPersonality: %s\n",
4818 prefix, yes_no(c->lock_personality));
4819
4820 if (c->syscall_filter) {
4821 #if HAVE_SECCOMP
4822 Iterator j;
4823 void *id, *val;
4824 bool first = true;
4825 #endif
4826
4827 fprintf(f,
4828 "%sSystemCallFilter: ",
4829 prefix);
4830
4831 if (!c->syscall_whitelist)
4832 fputc('~', f);
4833
4834 #if HAVE_SECCOMP
4835 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4836 _cleanup_free_ char *name = NULL;
4837 const char *errno_name = NULL;
4838 int num = PTR_TO_INT(val);
4839
4840 if (first)
4841 first = false;
4842 else
4843 fputc(' ', f);
4844
4845 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4846 fputs(strna(name), f);
4847
4848 if (num >= 0) {
4849 errno_name = errno_to_name(num);
4850 if (errno_name)
4851 fprintf(f, ":%s", errno_name);
4852 else
4853 fprintf(f, ":%d", num);
4854 }
4855 }
4856 #endif
4857
4858 fputc('\n', f);
4859 }
4860
4861 if (c->syscall_archs) {
4862 #if HAVE_SECCOMP
4863 Iterator j;
4864 void *id;
4865 #endif
4866
4867 fprintf(f,
4868 "%sSystemCallArchitectures:",
4869 prefix);
4870
4871 #if HAVE_SECCOMP
4872 SET_FOREACH(id, c->syscall_archs, j)
4873 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4874 #endif
4875 fputc('\n', f);
4876 }
4877
4878 if (exec_context_restrict_namespaces_set(c)) {
4879 _cleanup_free_ char *s = NULL;
4880
4881 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4882 if (r >= 0)
4883 fprintf(f, "%sRestrictNamespaces: %s\n",
4884 prefix, s);
4885 }
4886
4887 if (c->network_namespace_path)
4888 fprintf(f,
4889 "%sNetworkNamespacePath: %s\n",
4890 prefix, c->network_namespace_path);
4891
4892 if (c->syscall_errno > 0) {
4893 const char *errno_name;
4894
4895 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4896
4897 errno_name = errno_to_name(c->syscall_errno);
4898 if (errno_name)
4899 fprintf(f, "%s\n", errno_name);
4900 else
4901 fprintf(f, "%d\n", c->syscall_errno);
4902 }
4903 }
4904
4905 bool exec_context_maintains_privileges(const ExecContext *c) {
4906 assert(c);
4907
4908 /* Returns true if the process forked off would run under
4909 * an unchanged UID or as root. */
4910
4911 if (!c->user)
4912 return true;
4913
4914 if (streq(c->user, "root") || streq(c->user, "0"))
4915 return true;
4916
4917 return false;
4918 }
4919
4920 int exec_context_get_effective_ioprio(const ExecContext *c) {
4921 int p;
4922
4923 assert(c);
4924
4925 if (c->ioprio_set)
4926 return c->ioprio;
4927
4928 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4929 if (p < 0)
4930 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4931
4932 return p;
4933 }
4934
4935 void exec_context_free_log_extra_fields(ExecContext *c) {
4936 size_t l;
4937
4938 assert(c);
4939
4940 for (l = 0; l < c->n_log_extra_fields; l++)
4941 free(c->log_extra_fields[l].iov_base);
4942 c->log_extra_fields = mfree(c->log_extra_fields);
4943 c->n_log_extra_fields = 0;
4944 }
4945
4946 void exec_context_revert_tty(ExecContext *c) {
4947 int r;
4948
4949 assert(c);
4950
4951 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
4952 exec_context_tty_reset(c, NULL);
4953
4954 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
4955 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
4956 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
4957
4958 if (exec_context_may_touch_tty(c)) {
4959 const char *path;
4960
4961 path = exec_context_tty_path(c);
4962 if (path) {
4963 r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
4964 if (r < 0 && r != -ENOENT)
4965 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
4966 }
4967 }
4968 }
4969
4970 int exec_context_get_clean_directories(
4971 ExecContext *c,
4972 char **prefix,
4973 ExecCleanMask mask,
4974 char ***ret) {
4975
4976 _cleanup_strv_free_ char **l = NULL;
4977 ExecDirectoryType t;
4978 int r;
4979
4980 assert(c);
4981 assert(prefix);
4982 assert(ret);
4983
4984 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4985 char **i;
4986
4987 if (!FLAGS_SET(mask, 1U << t))
4988 continue;
4989
4990 if (!prefix[t])
4991 continue;
4992
4993 STRV_FOREACH(i, c->directories[t].paths) {
4994 char *j;
4995
4996 j = path_join(prefix[t], *i);
4997 if (!j)
4998 return -ENOMEM;
4999
5000 r = strv_consume(&l, j);
5001 if (r < 0)
5002 return r;
5003
5004 /* Also remove private directories unconditionally. */
5005 if (t != EXEC_DIRECTORY_CONFIGURATION) {
5006 j = path_join(prefix[t], "private", *i);
5007 if (!j)
5008 return -ENOMEM;
5009
5010 r = strv_consume(&l, j);
5011 if (r < 0)
5012 return r;
5013 }
5014 }
5015 }
5016
5017 *ret = TAKE_PTR(l);
5018 return 0;
5019 }
5020
5021 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
5022 ExecCleanMask mask = 0;
5023
5024 assert(c);
5025 assert(ret);
5026
5027 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5028 if (!strv_isempty(c->directories[t].paths))
5029 mask |= 1U << t;
5030
5031 *ret = mask;
5032 return 0;
5033 }
5034
5035 void exec_status_start(ExecStatus *s, pid_t pid) {
5036 assert(s);
5037
5038 *s = (ExecStatus) {
5039 .pid = pid,
5040 };
5041
5042 dual_timestamp_get(&s->start_timestamp);
5043 }
5044
5045 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
5046 assert(s);
5047
5048 if (s->pid != pid) {
5049 *s = (ExecStatus) {
5050 .pid = pid,
5051 };
5052 }
5053
5054 dual_timestamp_get(&s->exit_timestamp);
5055
5056 s->code = code;
5057 s->status = status;
5058
5059 if (context && context->utmp_id)
5060 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
5061 }
5062
5063 void exec_status_reset(ExecStatus *s) {
5064 assert(s);
5065
5066 *s = (ExecStatus) {};
5067 }
5068
5069 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
5070 char buf[FORMAT_TIMESTAMP_MAX];
5071
5072 assert(s);
5073 assert(f);
5074
5075 if (s->pid <= 0)
5076 return;
5077
5078 prefix = strempty(prefix);
5079
5080 fprintf(f,
5081 "%sPID: "PID_FMT"\n",
5082 prefix, s->pid);
5083
5084 if (dual_timestamp_is_set(&s->start_timestamp))
5085 fprintf(f,
5086 "%sStart Timestamp: %s\n",
5087 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
5088
5089 if (dual_timestamp_is_set(&s->exit_timestamp))
5090 fprintf(f,
5091 "%sExit Timestamp: %s\n"
5092 "%sExit Code: %s\n"
5093 "%sExit Status: %i\n",
5094 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
5095 prefix, sigchld_code_to_string(s->code),
5096 prefix, s->status);
5097 }
5098
5099 static char *exec_command_line(char **argv) {
5100 size_t k;
5101 char *n, *p, **a;
5102 bool first = true;
5103
5104 assert(argv);
5105
5106 k = 1;
5107 STRV_FOREACH(a, argv)
5108 k += strlen(*a)+3;
5109
5110 n = new(char, k);
5111 if (!n)
5112 return NULL;
5113
5114 p = n;
5115 STRV_FOREACH(a, argv) {
5116
5117 if (!first)
5118 *(p++) = ' ';
5119 else
5120 first = false;
5121
5122 if (strpbrk(*a, WHITESPACE)) {
5123 *(p++) = '\'';
5124 p = stpcpy(p, *a);
5125 *(p++) = '\'';
5126 } else
5127 p = stpcpy(p, *a);
5128
5129 }
5130
5131 *p = 0;
5132
5133 /* FIXME: this doesn't really handle arguments that have
5134 * spaces and ticks in them */
5135
5136 return n;
5137 }
5138
5139 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
5140 _cleanup_free_ char *cmd = NULL;
5141 const char *prefix2;
5142
5143 assert(c);
5144 assert(f);
5145
5146 prefix = strempty(prefix);
5147 prefix2 = strjoina(prefix, "\t");
5148
5149 cmd = exec_command_line(c->argv);
5150 fprintf(f,
5151 "%sCommand Line: %s\n",
5152 prefix, cmd ? cmd : strerror_safe(ENOMEM));
5153
5154 exec_status_dump(&c->exec_status, f, prefix2);
5155 }
5156
5157 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
5158 assert(f);
5159
5160 prefix = strempty(prefix);
5161
5162 LIST_FOREACH(command, c, c)
5163 exec_command_dump(c, f, prefix);
5164 }
5165
5166 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
5167 ExecCommand *end;
5168
5169 assert(l);
5170 assert(e);
5171
5172 if (*l) {
5173 /* It's kind of important, that we keep the order here */
5174 LIST_FIND_TAIL(command, *l, end);
5175 LIST_INSERT_AFTER(command, *l, end, e);
5176 } else
5177 *l = e;
5178 }
5179
5180 int exec_command_set(ExecCommand *c, const char *path, ...) {
5181 va_list ap;
5182 char **l, *p;
5183
5184 assert(c);
5185 assert(path);
5186
5187 va_start(ap, path);
5188 l = strv_new_ap(path, ap);
5189 va_end(ap);
5190
5191 if (!l)
5192 return -ENOMEM;
5193
5194 p = strdup(path);
5195 if (!p) {
5196 strv_free(l);
5197 return -ENOMEM;
5198 }
5199
5200 free_and_replace(c->path, p);
5201
5202 return strv_free_and_replace(c->argv, l);
5203 }
5204
5205 int exec_command_append(ExecCommand *c, const char *path, ...) {
5206 _cleanup_strv_free_ char **l = NULL;
5207 va_list ap;
5208 int r;
5209
5210 assert(c);
5211 assert(path);
5212
5213 va_start(ap, path);
5214 l = strv_new_ap(path, ap);
5215 va_end(ap);
5216
5217 if (!l)
5218 return -ENOMEM;
5219
5220 r = strv_extend_strv(&c->argv, l, false);
5221 if (r < 0)
5222 return r;
5223
5224 return 0;
5225 }
5226
5227 static void *remove_tmpdir_thread(void *p) {
5228 _cleanup_free_ char *path = p;
5229
5230 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5231 return NULL;
5232 }
5233
5234 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5235 int r;
5236
5237 if (!rt)
5238 return NULL;
5239
5240 if (rt->manager)
5241 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
5242
5243 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5244 if (destroy && rt->tmp_dir) {
5245 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
5246
5247 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
5248 if (r < 0) {
5249 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
5250 free(rt->tmp_dir);
5251 }
5252
5253 rt->tmp_dir = NULL;
5254 }
5255
5256 if (destroy && rt->var_tmp_dir) {
5257 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
5258
5259 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
5260 if (r < 0) {
5261 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
5262 free(rt->var_tmp_dir);
5263 }
5264
5265 rt->var_tmp_dir = NULL;
5266 }
5267
5268 rt->id = mfree(rt->id);
5269 rt->tmp_dir = mfree(rt->tmp_dir);
5270 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
5271 safe_close_pair(rt->netns_storage_socket);
5272 return mfree(rt);
5273 }
5274
5275 static void exec_runtime_freep(ExecRuntime **rt) {
5276 (void) exec_runtime_free(*rt, false);
5277 }
5278
5279 static int exec_runtime_allocate(ExecRuntime **ret) {
5280 ExecRuntime *n;
5281
5282 assert(ret);
5283
5284 n = new(ExecRuntime, 1);
5285 if (!n)
5286 return -ENOMEM;
5287
5288 *n = (ExecRuntime) {
5289 .netns_storage_socket = { -1, -1 },
5290 };
5291
5292 *ret = n;
5293 return 0;
5294 }
5295
5296 static int exec_runtime_add(
5297 Manager *m,
5298 const char *id,
5299 const char *tmp_dir,
5300 const char *var_tmp_dir,
5301 const int netns_storage_socket[2],
5302 ExecRuntime **ret) {
5303
5304 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
5305 int r;
5306
5307 assert(m);
5308 assert(id);
5309
5310 r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
5311 if (r < 0)
5312 return r;
5313
5314 r = exec_runtime_allocate(&rt);
5315 if (r < 0)
5316 return r;
5317
5318 rt->id = strdup(id);
5319 if (!rt->id)
5320 return -ENOMEM;
5321
5322 if (tmp_dir) {
5323 rt->tmp_dir = strdup(tmp_dir);
5324 if (!rt->tmp_dir)
5325 return -ENOMEM;
5326
5327 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
5328 assert(var_tmp_dir);
5329 rt->var_tmp_dir = strdup(var_tmp_dir);
5330 if (!rt->var_tmp_dir)
5331 return -ENOMEM;
5332 }
5333
5334 if (netns_storage_socket) {
5335 rt->netns_storage_socket[0] = netns_storage_socket[0];
5336 rt->netns_storage_socket[1] = netns_storage_socket[1];
5337 }
5338
5339 r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
5340 if (r < 0)
5341 return r;
5342
5343 rt->manager = m;
5344
5345 if (ret)
5346 *ret = rt;
5347
5348 /* do not remove created ExecRuntime object when the operation succeeds. */
5349 rt = NULL;
5350 return 0;
5351 }
5352
5353 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5354 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
5355 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
5356 int r;
5357
5358 assert(m);
5359 assert(c);
5360 assert(id);
5361
5362 /* It is not necessary to create ExecRuntime object. */
5363 if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
5364 return 0;
5365
5366 if (c->private_tmp) {
5367 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
5368 if (r < 0)
5369 return r;
5370 }
5371
5372 if (c->private_network || c->network_namespace_path) {
5373 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5374 return -errno;
5375 }
5376
5377 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
5378 if (r < 0)
5379 return r;
5380
5381 /* Avoid cleanup */
5382 netns_storage_socket[0] = netns_storage_socket[1] = -1;
5383 return 1;
5384 }
5385
5386 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5387 ExecRuntime *rt;
5388 int r;
5389
5390 assert(m);
5391 assert(id);
5392 assert(ret);
5393
5394 rt = hashmap_get(m->exec_runtime_by_id, id);
5395 if (rt)
5396 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5397 goto ref;
5398
5399 if (!create)
5400 return 0;
5401
5402 /* If not found, then create a new object. */
5403 r = exec_runtime_make(m, c, id, &rt);
5404 if (r <= 0)
5405 /* When r == 0, it is not necessary to create ExecRuntime object. */
5406 return r;
5407
5408 ref:
5409 /* increment reference counter. */
5410 rt->n_ref++;
5411 *ret = rt;
5412 return 1;
5413 }
5414
5415 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5416 if (!rt)
5417 return NULL;
5418
5419 assert(rt->n_ref > 0);
5420
5421 rt->n_ref--;
5422 if (rt->n_ref > 0)
5423 return NULL;
5424
5425 return exec_runtime_free(rt, destroy);
5426 }
5427
5428 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5429 ExecRuntime *rt;
5430 Iterator i;
5431
5432 assert(m);
5433 assert(f);
5434 assert(fds);
5435
5436 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5437 fprintf(f, "exec-runtime=%s", rt->id);
5438
5439 if (rt->tmp_dir)
5440 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
5441
5442 if (rt->var_tmp_dir)
5443 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
5444
5445 if (rt->netns_storage_socket[0] >= 0) {
5446 int copy;
5447
5448 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5449 if (copy < 0)
5450 return copy;
5451
5452 fprintf(f, " netns-socket-0=%i", copy);
5453 }
5454
5455 if (rt->netns_storage_socket[1] >= 0) {
5456 int copy;
5457
5458 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5459 if (copy < 0)
5460 return copy;
5461
5462 fprintf(f, " netns-socket-1=%i", copy);
5463 }
5464
5465 fputc('\n', f);
5466 }
5467
5468 return 0;
5469 }
5470
5471 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5472 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5473 ExecRuntime *rt;
5474 int r;
5475
5476 /* This is for the migration from old (v237 or earlier) deserialization text.
5477 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5478 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5479 * so or not from the serialized text, then we always creates a new object owned by this. */
5480
5481 assert(u);
5482 assert(key);
5483 assert(value);
5484
5485 /* Manager manages ExecRuntime objects by the unit id.
5486 * So, we omit the serialized text when the unit does not have id (yet?)... */
5487 if (isempty(u->id)) {
5488 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5489 return 0;
5490 }
5491
5492 r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5493 if (r < 0) {
5494 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5495 return 0;
5496 }
5497
5498 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5499 if (!rt) {
5500 r = exec_runtime_allocate(&rt_create);
5501 if (r < 0)
5502 return log_oom();
5503
5504 rt_create->id = strdup(u->id);
5505 if (!rt_create->id)
5506 return log_oom();
5507
5508 rt = rt_create;
5509 }
5510
5511 if (streq(key, "tmp-dir")) {
5512 char *copy;
5513
5514 copy = strdup(value);
5515 if (!copy)
5516 return log_oom();
5517
5518 free_and_replace(rt->tmp_dir, copy);
5519
5520 } else if (streq(key, "var-tmp-dir")) {
5521 char *copy;
5522
5523 copy = strdup(value);
5524 if (!copy)
5525 return log_oom();
5526
5527 free_and_replace(rt->var_tmp_dir, copy);
5528
5529 } else if (streq(key, "netns-socket-0")) {
5530 int fd;
5531
5532 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5533 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5534 return 0;
5535 }
5536
5537 safe_close(rt->netns_storage_socket[0]);
5538 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5539
5540 } else if (streq(key, "netns-socket-1")) {
5541 int fd;
5542
5543 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5544 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5545 return 0;
5546 }
5547
5548 safe_close(rt->netns_storage_socket[1]);
5549 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5550 } else
5551 return 0;
5552
5553 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5554 if (rt_create) {
5555 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5556 if (r < 0) {
5557 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5558 return 0;
5559 }
5560
5561 rt_create->manager = u->manager;
5562
5563 /* Avoid cleanup */
5564 rt_create = NULL;
5565 }
5566
5567 return 1;
5568 }
5569
5570 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5571 char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5572 int r, fd0 = -1, fd1 = -1;
5573 const char *p, *v = value;
5574 size_t n;
5575
5576 assert(m);
5577 assert(value);
5578 assert(fds);
5579
5580 n = strcspn(v, " ");
5581 id = strndupa(v, n);
5582 if (v[n] != ' ')
5583 goto finalize;
5584 p = v + n + 1;
5585
5586 v = startswith(p, "tmp-dir=");
5587 if (v) {
5588 n = strcspn(v, " ");
5589 tmp_dir = strndupa(v, n);
5590 if (v[n] != ' ')
5591 goto finalize;
5592 p = v + n + 1;
5593 }
5594
5595 v = startswith(p, "var-tmp-dir=");
5596 if (v) {
5597 n = strcspn(v, " ");
5598 var_tmp_dir = strndupa(v, n);
5599 if (v[n] != ' ')
5600 goto finalize;
5601 p = v + n + 1;
5602 }
5603
5604 v = startswith(p, "netns-socket-0=");
5605 if (v) {
5606 char *buf;
5607
5608 n = strcspn(v, " ");
5609 buf = strndupa(v, n);
5610 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5611 log_debug("Unable to process exec-runtime netns fd specification.");
5612 return;
5613 }
5614 fd0 = fdset_remove(fds, fd0);
5615 if (v[n] != ' ')
5616 goto finalize;
5617 p = v + n + 1;
5618 }
5619
5620 v = startswith(p, "netns-socket-1=");
5621 if (v) {
5622 char *buf;
5623
5624 n = strcspn(v, " ");
5625 buf = strndupa(v, n);
5626 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5627 log_debug("Unable to process exec-runtime netns fd specification.");
5628 return;
5629 }
5630 fd1 = fdset_remove(fds, fd1);
5631 }
5632
5633 finalize:
5634
5635 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5636 if (r < 0)
5637 log_debug_errno(r, "Failed to add exec-runtime: %m");
5638 }
5639
5640 void exec_runtime_vacuum(Manager *m) {
5641 ExecRuntime *rt;
5642 Iterator i;
5643
5644 assert(m);
5645
5646 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5647
5648 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5649 if (rt->n_ref > 0)
5650 continue;
5651
5652 (void) exec_runtime_free(rt, false);
5653 }
5654 }
5655
5656 void exec_params_clear(ExecParameters *p) {
5657 if (!p)
5658 return;
5659
5660 strv_free(p->environment);
5661 }
5662
5663 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5664 [EXEC_INPUT_NULL] = "null",
5665 [EXEC_INPUT_TTY] = "tty",
5666 [EXEC_INPUT_TTY_FORCE] = "tty-force",
5667 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
5668 [EXEC_INPUT_SOCKET] = "socket",
5669 [EXEC_INPUT_NAMED_FD] = "fd",
5670 [EXEC_INPUT_DATA] = "data",
5671 [EXEC_INPUT_FILE] = "file",
5672 };
5673
5674 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5675
5676 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5677 [EXEC_OUTPUT_INHERIT] = "inherit",
5678 [EXEC_OUTPUT_NULL] = "null",
5679 [EXEC_OUTPUT_TTY] = "tty",
5680 [EXEC_OUTPUT_SYSLOG] = "syslog",
5681 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
5682 [EXEC_OUTPUT_KMSG] = "kmsg",
5683 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5684 [EXEC_OUTPUT_JOURNAL] = "journal",
5685 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5686 [EXEC_OUTPUT_SOCKET] = "socket",
5687 [EXEC_OUTPUT_NAMED_FD] = "fd",
5688 [EXEC_OUTPUT_FILE] = "file",
5689 [EXEC_OUTPUT_FILE_APPEND] = "append",
5690 };
5691
5692 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5693
5694 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5695 [EXEC_UTMP_INIT] = "init",
5696 [EXEC_UTMP_LOGIN] = "login",
5697 [EXEC_UTMP_USER] = "user",
5698 };
5699
5700 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5701
5702 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5703 [EXEC_PRESERVE_NO] = "no",
5704 [EXEC_PRESERVE_YES] = "yes",
5705 [EXEC_PRESERVE_RESTART] = "restart",
5706 };
5707
5708 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5709
5710 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
5711 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5712 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5713 [EXEC_DIRECTORY_STATE] = "StateDirectory",
5714 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5715 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5716 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5717 };
5718
5719 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5720
5721 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
5722 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
5723 * directories, specifically .timer units with their timestamp touch file. */
5724 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5725 [EXEC_DIRECTORY_RUNTIME] = "runtime",
5726 [EXEC_DIRECTORY_STATE] = "state",
5727 [EXEC_DIRECTORY_CACHE] = "cache",
5728 [EXEC_DIRECTORY_LOGS] = "logs",
5729 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
5730 };
5731
5732 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
5733
5734 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
5735 * the service payload in. */
5736 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5737 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
5738 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
5739 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
5740 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
5741 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
5742 };
5743
5744 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5745
5746 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5747 [EXEC_KEYRING_INHERIT] = "inherit",
5748 [EXEC_KEYRING_PRIVATE] = "private",
5749 [EXEC_KEYRING_SHARED] = "shared",
5750 };
5751
5752 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);