]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
21127d4f70262e692400f7734154f760e2f686dd
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <glob.h>
6 #include <grp.h>
7 #include <poll.h>
8 #include <signal.h>
9 #include <string.h>
10 #include <sys/capability.h>
11 #include <sys/eventfd.h>
12 #include <sys/mman.h>
13 #include <sys/personality.h>
14 #include <sys/prctl.h>
15 #include <sys/shm.h>
16 #include <sys/socket.h>
17 #include <sys/stat.h>
18 #include <sys/types.h>
19 #include <sys/un.h>
20 #include <unistd.h>
21 #include <utmpx.h>
22
23 #if HAVE_PAM
24 #include <security/pam_appl.h>
25 #endif
26
27 #if HAVE_SELINUX
28 #include <selinux/selinux.h>
29 #endif
30
31 #if HAVE_SECCOMP
32 #include <seccomp.h>
33 #endif
34
35 #if HAVE_APPARMOR
36 #include <sys/apparmor.h>
37 #endif
38
39 #include "sd-messages.h"
40
41 #include "af-list.h"
42 #include "alloc-util.h"
43 #if HAVE_APPARMOR
44 #include "apparmor-util.h"
45 #endif
46 #include "async.h"
47 #include "barrier.h"
48 #include "cap-list.h"
49 #include "capability-util.h"
50 #include "chown-recursive.h"
51 #include "cpu-set-util.h"
52 #include "def.h"
53 #include "env-file.h"
54 #include "env-util.h"
55 #include "errno-list.h"
56 #include "execute.h"
57 #include "exit-status.h"
58 #include "fd-util.h"
59 #include "format-util.h"
60 #include "fs-util.h"
61 #include "glob-util.h"
62 #include "io-util.h"
63 #include "ioprio.h"
64 #include "label.h"
65 #include "log.h"
66 #include "macro.h"
67 #include "manager.h"
68 #include "memory-util.h"
69 #include "missing.h"
70 #include "mkdir.h"
71 #include "namespace.h"
72 #include "parse-util.h"
73 #include "path-util.h"
74 #include "process-util.h"
75 #include "rlimit-util.h"
76 #include "rm-rf.h"
77 #if HAVE_SECCOMP
78 #include "seccomp-util.h"
79 #endif
80 #include "securebits-util.h"
81 #include "selinux-util.h"
82 #include "signal-util.h"
83 #include "smack-util.h"
84 #include "socket-util.h"
85 #include "special.h"
86 #include "stat-util.h"
87 #include "string-table.h"
88 #include "string-util.h"
89 #include "strv.h"
90 #include "syslog-util.h"
91 #include "terminal-util.h"
92 #include "umask-util.h"
93 #include "unit.h"
94 #include "user-util.h"
95 #include "utmp-wtmp.h"
96
/* Idle timeouts: five times and one times USEC_PER_SEC respectively. */
#define IDLE_TIMEOUT_USEC (5 * USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1 * USEC_PER_SEC)

/* Send buffer size (8 MiB) requested for the logger stream socket, see connect_logger_as(). */
#define SNDBUF_SIZE (8 * 1024 * 1024)
101
/* Renumbers the passed file descriptors so that fds[i] ends up being fd number i+3, i.e. the descriptors
 * are laid out contiguously right after stdin/stdout/stderr. The array is rewritten in place.
 *
 * Returns 0 on success, or a negative errno-style value if duplicating a descriptor fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds <= 0)
                return 0;

        /* Careful: the caller's array gets modified! */

        assert(fds);

        for (;;) {
                int retry_at = -1;
                int idx;

                for (idx = first; idx < (int) n_fds; idx++) {
                        const int wanted = idx + 3;
                        int copy;

                        /* This one already sits in its final slot */
                        if (fds[idx] == wanted)
                                continue;

                        copy = fcntl(fds[idx], F_DUPFD, wanted);
                        if (copy < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = copy;

                        /* If we got a different number than the one we asked for, the wanted slot is
                         * still taken. Remember the first index where that happened and do another
                         * pass starting from there. */
                        if (copy != wanted && retry_at < 0)
                                retry_at = idx;
                }

                /* A pass without any misplaced descriptor means we are done */
                if (retry_at < 0)
                        return 0;

                first = retry_at;
        }
}
146
/* Adjusts the file flags of the fds we are about to pass on. The first n_socket_fds entries of the array get
 * O_NONBLOCK set or cleared according to 'nonblock'; all n_socket_fds + n_storage_fds entries get
 * FD_CLOEXEC removed.
 *
 * Returns 0 on success, or the negative error of the first helper call that failed. */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        const size_t total = n_socket_fds + n_storage_fds;
        size_t k;
        int r;

        if (total <= 0)
                return 0;

        assert(fds);

        for (k = 0; k < total; k++) {

                /* O_NONBLOCK is only relevant for socket activation, hence leave the storage fds alone */
                if (k < n_socket_fds) {
                        r = fd_nonblock(fds[k], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is always dropped: the whole point is to hand these fds to our children */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
179
180 static const char *exec_context_tty_path(const ExecContext *context) {
181 assert(context);
182
183 if (context->stdio_as_fds)
184 return NULL;
185
186 if (context->tty_path)
187 return context->tty_path;
188
189 return "/dev/console";
190 }
191
192 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
193 const char *path;
194
195 assert(context);
196
197 path = exec_context_tty_path(context);
198
199 if (context->tty_vhangup) {
200 if (p && p->stdin_fd >= 0)
201 (void) terminal_vhangup_fd(p->stdin_fd);
202 else if (path)
203 (void) terminal_vhangup(path);
204 }
205
206 if (context->tty_reset) {
207 if (p && p->stdin_fd >= 0)
208 (void) reset_terminal_fd(p->stdin_fd, true);
209 else if (path)
210 (void) reset_terminal(path);
211 }
212
213 if (context->tty_vt_disallocate && path)
214 (void) vt_disallocate(path);
215 }
216
217 static bool is_terminal_input(ExecInput i) {
218 return IN_SET(i,
219 EXEC_INPUT_TTY,
220 EXEC_INPUT_TTY_FORCE,
221 EXEC_INPUT_TTY_FAIL);
222 }
223
224 static bool is_terminal_output(ExecOutput o) {
225 return IN_SET(o,
226 EXEC_OUTPUT_TTY,
227 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
228 EXEC_OUTPUT_KMSG_AND_CONSOLE,
229 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
230 }
231
232 static bool is_syslog_output(ExecOutput o) {
233 return IN_SET(o,
234 EXEC_OUTPUT_SYSLOG,
235 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
236 }
237
238 static bool is_kmsg_output(ExecOutput o) {
239 return IN_SET(o,
240 EXEC_OUTPUT_KMSG,
241 EXEC_OUTPUT_KMSG_AND_CONSOLE);
242 }
243
244 static bool exec_context_needs_term(const ExecContext *c) {
245 assert(c);
246
247 /* Return true if the execution context suggests we should set $TERM to something useful. */
248
249 if (is_terminal_input(c->std_input))
250 return true;
251
252 if (is_terminal_output(c->std_output))
253 return true;
254
255 if (is_terminal_output(c->std_error))
256 return true;
257
258 return !!c->tty_path;
259 }
260
/* Opens /dev/null with the given flags (plus O_NOCTTY) and moves the resulting fd to number 'nfd'.
 * Returns the result of move_fd(), or a negative errno-style value if opening failed. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags | O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
272
273 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
274 static const union sockaddr_union sa = {
275 .un.sun_family = AF_UNIX,
276 .un.sun_path = "/run/systemd/journal/stdout",
277 };
278 uid_t olduid = UID_INVALID;
279 gid_t oldgid = GID_INVALID;
280 int r;
281
282 if (gid_is_valid(gid)) {
283 oldgid = getgid();
284
285 if (setegid(gid) < 0)
286 return -errno;
287 }
288
289 if (uid_is_valid(uid)) {
290 olduid = getuid();
291
292 if (seteuid(uid) < 0) {
293 r = -errno;
294 goto restore_gid;
295 }
296 }
297
298 r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
299
300 /* If we fail to restore the uid or gid, things will likely
301 fail later on. This should only happen if an LSM interferes. */
302
303 if (uid_is_valid(uid))
304 (void) seteuid(olduid);
305
306 restore_gid:
307 if (gid_is_valid(gid))
308 (void) setegid(oldgid);
309
310 return r;
311 }
312
313 static int connect_logger_as(
314 const Unit *unit,
315 const ExecContext *context,
316 const ExecParameters *params,
317 ExecOutput output,
318 const char *ident,
319 int nfd,
320 uid_t uid,
321 gid_t gid) {
322
323 _cleanup_close_ int fd = -1;
324 int r;
325
326 assert(context);
327 assert(params);
328 assert(output < _EXEC_OUTPUT_MAX);
329 assert(ident);
330 assert(nfd >= 0);
331
332 fd = socket(AF_UNIX, SOCK_STREAM, 0);
333 if (fd < 0)
334 return -errno;
335
336 r = connect_journal_socket(fd, uid, gid);
337 if (r < 0)
338 return r;
339
340 if (shutdown(fd, SHUT_RD) < 0)
341 return -errno;
342
343 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
344
345 if (dprintf(fd,
346 "%s\n"
347 "%s\n"
348 "%i\n"
349 "%i\n"
350 "%i\n"
351 "%i\n"
352 "%i\n",
353 context->syslog_identifier ?: ident,
354 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
355 context->syslog_priority,
356 !!context->syslog_level_prefix,
357 is_syslog_output(output),
358 is_kmsg_output(output),
359 is_terminal_output(output)) < 0)
360 return -errno;
361
362 return move_fd(TAKE_FD(fd), nfd, false);
363 }
364
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves the fd to number 'nfd'.
 * Returns the result of move_fd(), or the negative error returned by open_terminal(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
377
378 static int acquire_path(const char *path, int flags, mode_t mode) {
379 union sockaddr_union sa = {};
380 _cleanup_close_ int fd = -1;
381 int r, salen;
382
383 assert(path);
384
385 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
386 flags |= O_CREAT;
387
388 fd = open(path, flags|O_NOCTTY, mode);
389 if (fd >= 0)
390 return TAKE_FD(fd);
391
392 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
393 return -errno;
394 if (strlen(path) >= sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
395 return -ENXIO;
396
397 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
398
399 fd = socket(AF_UNIX, SOCK_STREAM, 0);
400 if (fd < 0)
401 return -errno;
402
403 salen = sockaddr_un_set_path(&sa.un, path);
404 if (salen < 0)
405 return salen;
406
407 if (connect(fd, &sa.sa, salen) < 0)
408 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
409 * indication that his wasn't an AF_UNIX socket after all */
410
411 if ((flags & O_ACCMODE) == O_RDONLY)
412 r = shutdown(fd, SHUT_WR);
413 else if ((flags & O_ACCMODE) == O_WRONLY)
414 r = shutdown(fd, SHUT_RD);
415 else
416 return TAKE_FD(fd);
417 if (r < 0)
418 return -errno;
419
420 return TAKE_FD(fd);
421 }
422
423 static int fixup_input(
424 const ExecContext *context,
425 int socket_fd,
426 bool apply_tty_stdin) {
427
428 ExecInput std_input;
429
430 assert(context);
431
432 std_input = context->std_input;
433
434 if (is_terminal_input(std_input) && !apply_tty_stdin)
435 return EXEC_INPUT_NULL;
436
437 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
438 return EXEC_INPUT_NULL;
439
440 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
441 return EXEC_INPUT_NULL;
442
443 return std_input;
444 }
445
446 static int fixup_output(ExecOutput std_output, int socket_fd) {
447
448 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
449 return EXEC_OUTPUT_INHERIT;
450
451 return std_output;
452 }
453
454 static int setup_input(
455 const ExecContext *context,
456 const ExecParameters *params,
457 int socket_fd,
458 const int named_iofds[static 3]) {
459
460 ExecInput i;
461
462 assert(context);
463 assert(params);
464 assert(named_iofds);
465
466 if (params->stdin_fd >= 0) {
467 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
468 return -errno;
469
470 /* Try to make this the controlling tty, if it is a tty, and reset it */
471 if (isatty(STDIN_FILENO)) {
472 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
473 (void) reset_terminal_fd(STDIN_FILENO, true);
474 }
475
476 return STDIN_FILENO;
477 }
478
479 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
480
481 switch (i) {
482
483 case EXEC_INPUT_NULL:
484 return open_null_as(O_RDONLY, STDIN_FILENO);
485
486 case EXEC_INPUT_TTY:
487 case EXEC_INPUT_TTY_FORCE:
488 case EXEC_INPUT_TTY_FAIL: {
489 int fd;
490
491 fd = acquire_terminal(exec_context_tty_path(context),
492 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
493 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
494 ACQUIRE_TERMINAL_WAIT,
495 USEC_INFINITY);
496 if (fd < 0)
497 return fd;
498
499 return move_fd(fd, STDIN_FILENO, false);
500 }
501
502 case EXEC_INPUT_SOCKET:
503 assert(socket_fd >= 0);
504
505 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
506
507 case EXEC_INPUT_NAMED_FD:
508 assert(named_iofds[STDIN_FILENO] >= 0);
509
510 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
511 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
512
513 case EXEC_INPUT_DATA: {
514 int fd;
515
516 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
517 if (fd < 0)
518 return fd;
519
520 return move_fd(fd, STDIN_FILENO, false);
521 }
522
523 case EXEC_INPUT_FILE: {
524 bool rw;
525 int fd;
526
527 assert(context->stdio_file[STDIN_FILENO]);
528
529 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
530 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
531
532 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
533 if (fd < 0)
534 return fd;
535
536 return move_fd(fd, STDIN_FILENO, false);
537 }
538
539 default:
540 assert_not_reached("Unknown input type");
541 }
542 }
543
544 static bool can_inherit_stderr_from_stdout(
545 const ExecContext *context,
546 ExecOutput o,
547 ExecOutput e) {
548
549 assert(context);
550
551 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
552 * stderr fd */
553
554 if (e == EXEC_OUTPUT_INHERIT)
555 return true;
556 if (e != o)
557 return false;
558
559 if (e == EXEC_OUTPUT_NAMED_FD)
560 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
561
562 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
563 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
564
565 return true;
566 }
567
568 static int setup_output(
569 const Unit *unit,
570 const ExecContext *context,
571 const ExecParameters *params,
572 int fileno,
573 int socket_fd,
574 const int named_iofds[static 3],
575 const char *ident,
576 uid_t uid,
577 gid_t gid,
578 dev_t *journal_stream_dev,
579 ino_t *journal_stream_ino) {
580
581 ExecOutput o;
582 ExecInput i;
583 int r;
584
585 assert(unit);
586 assert(context);
587 assert(params);
588 assert(ident);
589 assert(journal_stream_dev);
590 assert(journal_stream_ino);
591
592 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
593
594 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
595 return -errno;
596
597 return STDOUT_FILENO;
598 }
599
600 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
601 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
602 return -errno;
603
604 return STDERR_FILENO;
605 }
606
607 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
608 o = fixup_output(context->std_output, socket_fd);
609
610 if (fileno == STDERR_FILENO) {
611 ExecOutput e;
612 e = fixup_output(context->std_error, socket_fd);
613
614 /* This expects the input and output are already set up */
615
616 /* Don't change the stderr file descriptor if we inherit all
617 * the way and are not on a tty */
618 if (e == EXEC_OUTPUT_INHERIT &&
619 o == EXEC_OUTPUT_INHERIT &&
620 i == EXEC_INPUT_NULL &&
621 !is_terminal_input(context->std_input) &&
622 getppid () != 1)
623 return fileno;
624
625 /* Duplicate from stdout if possible */
626 if (can_inherit_stderr_from_stdout(context, o, e))
627 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
628
629 o = e;
630
631 } else if (o == EXEC_OUTPUT_INHERIT) {
632 /* If input got downgraded, inherit the original value */
633 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
634 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
635
636 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
637 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
638 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
639
640 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
641 if (getppid() != 1)
642 return fileno;
643
644 /* We need to open /dev/null here anew, to get the right access mode. */
645 return open_null_as(O_WRONLY, fileno);
646 }
647
648 switch (o) {
649
650 case EXEC_OUTPUT_NULL:
651 return open_null_as(O_WRONLY, fileno);
652
653 case EXEC_OUTPUT_TTY:
654 if (is_terminal_input(i))
655 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
656
657 /* We don't reset the terminal if this is just about output */
658 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
659
660 case EXEC_OUTPUT_SYSLOG:
661 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
662 case EXEC_OUTPUT_KMSG:
663 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
664 case EXEC_OUTPUT_JOURNAL:
665 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
666 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
667 if (r < 0) {
668 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
669 r = open_null_as(O_WRONLY, fileno);
670 } else {
671 struct stat st;
672
673 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
674 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
675 * services to detect whether they are connected to the journal or not.
676 *
677 * If both stdout and stderr are connected to a stream then let's make sure to store the data
678 * about STDERR as that's usually the best way to do logging. */
679
680 if (fstat(fileno, &st) >= 0 &&
681 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
682 *journal_stream_dev = st.st_dev;
683 *journal_stream_ino = st.st_ino;
684 }
685 }
686 return r;
687
688 case EXEC_OUTPUT_SOCKET:
689 assert(socket_fd >= 0);
690
691 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
692
693 case EXEC_OUTPUT_NAMED_FD:
694 assert(named_iofds[fileno] >= 0);
695
696 (void) fd_nonblock(named_iofds[fileno], false);
697 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
698
699 case EXEC_OUTPUT_FILE:
700 case EXEC_OUTPUT_FILE_APPEND: {
701 bool rw;
702 int fd, flags;
703
704 assert(context->stdio_file[fileno]);
705
706 rw = context->std_input == EXEC_INPUT_FILE &&
707 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
708
709 if (rw)
710 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
711
712 flags = O_WRONLY;
713 if (o == EXEC_OUTPUT_FILE_APPEND)
714 flags |= O_APPEND;
715
716 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
717 if (fd < 0)
718 return fd;
719
720 return move_fd(fd, fileno, 0);
721 }
722
723 default:
724 assert_not_reached("Unknown error type");
725 }
726 }
727
728 static int chown_terminal(int fd, uid_t uid) {
729 int r;
730
731 assert(fd >= 0);
732
733 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
734 if (isatty(fd) < 1) {
735 if (IN_SET(errno, EINVAL, ENOTTY))
736 return 0; /* not a tty */
737
738 return -errno;
739 }
740
741 /* This might fail. What matters are the results. */
742 r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
743 if (r < 0)
744 return r;
745
746 return 1;
747 }
748
749 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
750 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
751 int r;
752
753 assert(_saved_stdin);
754 assert(_saved_stdout);
755
756 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
757 if (saved_stdin < 0)
758 return -errno;
759
760 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
761 if (saved_stdout < 0)
762 return -errno;
763
764 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
765 if (fd < 0)
766 return fd;
767
768 r = chown_terminal(fd, getuid());
769 if (r < 0)
770 return r;
771
772 r = reset_terminal_fd(fd, true);
773 if (r < 0)
774 return r;
775
776 r = rearrange_stdio(fd, fd, STDERR_FILENO);
777 fd = -1;
778 if (r < 0)
779 return r;
780
781 *_saved_stdin = saved_stdin;
782 *_saved_stdout = saved_stdout;
783
784 saved_stdin = saved_stdout = -1;
785
786 return 0;
787 }
788
789 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
790 assert(err < 0);
791
792 if (err == -ETIMEDOUT)
793 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
794 else {
795 errno = -err;
796 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
797 }
798 }
799
800 static void write_confirm_error(int err, const char *vc, const Unit *u) {
801 _cleanup_close_ int fd = -1;
802
803 assert(vc);
804
805 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
806 if (fd < 0)
807 return;
808
809 write_confirm_error_fd(err, fd, u);
810 }
811
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved fds back in place as stdin/stdout
 * and closes the saved copies.
 *
 * Returns 0 on success. If restoring failed the negative errno-style value of the last failing dup2()
 * is returned; the saved fds are closed in any case. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
833
/* Possible results of the spawn confirmation dialog, see ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* execute the command */
};
839
840 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
841 int saved_stdout = -1, saved_stdin = -1, r;
842 _cleanup_free_ char *e = NULL;
843 char c;
844
845 /* For any internal errors, assume a positive response. */
846 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
847 if (r < 0) {
848 write_confirm_error(r, vc, u);
849 return CONFIRM_EXECUTE;
850 }
851
852 /* confirm_spawn might have been disabled while we were sleeping. */
853 if (manager_is_confirm_spawn_disabled(u->manager)) {
854 r = 1;
855 goto restore_stdio;
856 }
857
858 e = ellipsize(cmdline, 60, 100);
859 if (!e) {
860 log_oom();
861 r = CONFIRM_EXECUTE;
862 goto restore_stdio;
863 }
864
865 for (;;) {
866 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
867 if (r < 0) {
868 write_confirm_error_fd(r, STDOUT_FILENO, u);
869 r = CONFIRM_EXECUTE;
870 goto restore_stdio;
871 }
872
873 switch (c) {
874 case 'c':
875 printf("Resuming normal execution.\n");
876 manager_disable_confirm_spawn();
877 r = 1;
878 break;
879 case 'D':
880 unit_dump(u, stdout, " ");
881 continue; /* ask again */
882 case 'f':
883 printf("Failing execution.\n");
884 r = CONFIRM_PRETEND_FAILURE;
885 break;
886 case 'h':
887 printf(" c - continue, proceed without asking anymore\n"
888 " D - dump, show the state of the unit\n"
889 " f - fail, don't execute the command and pretend it failed\n"
890 " h - help\n"
891 " i - info, show a short summary of the unit\n"
892 " j - jobs, show jobs that are in progress\n"
893 " s - skip, don't execute the command and pretend it succeeded\n"
894 " y - yes, execute the command\n");
895 continue; /* ask again */
896 case 'i':
897 printf(" Description: %s\n"
898 " Unit: %s\n"
899 " Command: %s\n",
900 u->id, u->description, cmdline);
901 continue; /* ask again */
902 case 'j':
903 manager_dump_jobs(u->manager, stdout, " ");
904 continue; /* ask again */
905 case 'n':
906 /* 'n' was removed in favor of 'f'. */
907 printf("Didn't understand 'n', did you mean 'f'?\n");
908 continue; /* ask again */
909 case 's':
910 printf("Skipping execution.\n");
911 r = CONFIRM_PRETEND_SUCCESS;
912 break;
913 case 'y':
914 r = CONFIRM_EXECUTE;
915 break;
916 default:
917 assert_not_reached("Unhandled choice");
918 }
919 break;
920 }
921
922 restore_stdio:
923 restore_confirm_stdio(&saved_stdin, &saved_stdout);
924 return r;
925 }
926
927 static int get_fixed_user(const ExecContext *c, const char **user,
928 uid_t *uid, gid_t *gid,
929 const char **home, const char **shell) {
930 int r;
931 const char *name;
932
933 assert(c);
934
935 if (!c->user)
936 return 0;
937
938 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
939 * (i.e. are "/" or "/bin/nologin"). */
940
941 name = c->user;
942 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
943 if (r < 0)
944 return r;
945
946 *user = name;
947 return 0;
948 }
949
950 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
951 int r;
952 const char *name;
953
954 assert(c);
955
956 if (!c->group)
957 return 0;
958
959 name = c->group;
960 r = get_group_creds(&name, gid, 0);
961 if (r < 0)
962 return r;
963
964 *group = name;
965 return 0;
966 }
967
968 static int get_supplementary_groups(const ExecContext *c, const char *user,
969 const char *group, gid_t gid,
970 gid_t **supplementary_gids, int *ngids) {
971 char **i;
972 int r, k = 0;
973 int ngroups_max;
974 bool keep_groups = false;
975 gid_t *groups = NULL;
976 _cleanup_free_ gid_t *l_gids = NULL;
977
978 assert(c);
979
980 /*
981 * If user is given, then lookup GID and supplementary groups list.
982 * We avoid NSS lookups for gid=0. Also we have to initialize groups
983 * here and as early as possible so we keep the list of supplementary
984 * groups of the caller.
985 */
986 if (user && gid_is_valid(gid) && gid != 0) {
987 /* First step, initialize groups from /etc/groups */
988 if (initgroups(user, gid) < 0)
989 return -errno;
990
991 keep_groups = true;
992 }
993
994 if (strv_isempty(c->supplementary_groups))
995 return 0;
996
997 /*
998 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
999 * be positive, otherwise fail.
1000 */
1001 errno = 0;
1002 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1003 if (ngroups_max <= 0)
1004 return errno_or_else(EOPNOTSUPP);
1005
1006 l_gids = new(gid_t, ngroups_max);
1007 if (!l_gids)
1008 return -ENOMEM;
1009
1010 if (keep_groups) {
1011 /*
1012 * Lookup the list of groups that the user belongs to, we
1013 * avoid NSS lookups here too for gid=0.
1014 */
1015 k = ngroups_max;
1016 if (getgrouplist(user, gid, l_gids, &k) < 0)
1017 return -EINVAL;
1018 } else
1019 k = 0;
1020
1021 STRV_FOREACH(i, c->supplementary_groups) {
1022 const char *g;
1023
1024 if (k >= ngroups_max)
1025 return -E2BIG;
1026
1027 g = *i;
1028 r = get_group_creds(&g, l_gids+k, 0);
1029 if (r < 0)
1030 return r;
1031
1032 k++;
1033 }
1034
1035 /*
1036 * Sets ngids to zero to drop all supplementary groups, happens
1037 * when we are under root and SupplementaryGroups= is empty.
1038 */
1039 if (k == 0) {
1040 *ngids = 0;
1041 return 0;
1042 }
1043
1044 /* Otherwise get the final list of supplementary groups */
1045 groups = memdup(l_gids, sizeof(gid_t) * k);
1046 if (!groups)
1047 return -ENOMEM;
1048
1049 *supplementary_gids = groups;
1050 *ngids = k;
1051
1052 groups = NULL;
1053
1054 return 0;
1055 }
1056
/* Applies the group credentials: first the supplementary groups (only if ngids is positive), then, if
 * 'gid' is valid, the real, effective and saved GID.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        int r;

        /* SupplementaryGroups= is only handled if the list is not empty */
        if (ngids > 0) {
                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1075
1076 static int enforce_user(const ExecContext *context, uid_t uid) {
1077 assert(context);
1078
1079 if (!uid_is_valid(uid))
1080 return 0;
1081
1082 /* Sets (but doesn't look up) the uid and make sure we keep the
1083 * capabilities while doing so. */
1084
1085 if (context->capability_ambient_set != 0) {
1086
1087 /* First step: If we need to keep capabilities but
1088 * drop privileges we need to make sure we keep our
1089 * caps, while we drop privileges. */
1090 if (uid != 0) {
1091 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1092
1093 if (prctl(PR_GET_SECUREBITS) != sb)
1094 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1095 return -errno;
1096 }
1097 }
1098
1099 /* Second step: actually set the uids */
1100 if (setresuid(uid, uid, uid) < 0)
1101 return -errno;
1102
1103 /* At this point we should have all necessary capabilities but
1104 are otherwise a normal user. However, the caps might got
1105 corrupted due to the setresuid() so we need clean them up
1106 later. This is done outside of this call. */
1107
1108 return 0;
1109 }
1110
1111 #if HAVE_PAM
1112
1113 static int null_conv(
1114 int num_msg,
1115 const struct pam_message **msg,
1116 struct pam_response **resp,
1117 void *appdata_ptr) {
1118
1119 /* We don't support conversations */
1120
1121 return PAM_CONV_ERR;
1122 }
1123
1124 #endif
1125
1126 static int setup_pam(
1127 const char *name,
1128 const char *user,
1129 uid_t uid,
1130 gid_t gid,
1131 const char *tty,
1132 char ***env,
1133 int fds[], size_t n_fds) {
1134
1135 #if HAVE_PAM
1136
1137 static const struct pam_conv conv = {
1138 .conv = null_conv,
1139 .appdata_ptr = NULL
1140 };
1141
1142 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1143 pam_handle_t *handle = NULL;
1144 sigset_t old_ss;
1145 int pam_code = PAM_SUCCESS, r;
1146 char **nv, **e = NULL;
1147 bool close_session = false;
1148 pid_t pam_pid = 0, parent_pid;
1149 int flags = 0;
1150
1151 assert(name);
1152 assert(user);
1153 assert(env);
1154
1155 /* We set up PAM in the parent process, then fork. The child
1156 * will then stay around until killed via PR_GET_PDEATHSIG or
1157 * systemd via the cgroup logic. It will then remove the PAM
1158 * session again. The parent process will exec() the actual
1159 * daemon. We do things this way to ensure that the main PID
1160 * of the daemon is the one we initially fork()ed. */
1161
1162 r = barrier_create(&barrier);
1163 if (r < 0)
1164 goto fail;
1165
1166 if (log_get_max_level() < LOG_DEBUG)
1167 flags |= PAM_SILENT;
1168
1169 pam_code = pam_start(name, user, &conv, &handle);
1170 if (pam_code != PAM_SUCCESS) {
1171 handle = NULL;
1172 goto fail;
1173 }
1174
1175 if (!tty) {
1176 _cleanup_free_ char *q = NULL;
1177
1178 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1179 * out if that's the case, and read the TTY off it. */
1180
1181 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1182 tty = strjoina("/dev/", q);
1183 }
1184
1185 if (tty) {
1186 pam_code = pam_set_item(handle, PAM_TTY, tty);
1187 if (pam_code != PAM_SUCCESS)
1188 goto fail;
1189 }
1190
1191 STRV_FOREACH(nv, *env) {
1192 pam_code = pam_putenv(handle, *nv);
1193 if (pam_code != PAM_SUCCESS)
1194 goto fail;
1195 }
1196
1197 pam_code = pam_acct_mgmt(handle, flags);
1198 if (pam_code != PAM_SUCCESS)
1199 goto fail;
1200
1201 pam_code = pam_open_session(handle, flags);
1202 if (pam_code != PAM_SUCCESS)
1203 goto fail;
1204
1205 close_session = true;
1206
1207 e = pam_getenvlist(handle);
1208 if (!e) {
1209 pam_code = PAM_BUF_ERR;
1210 goto fail;
1211 }
1212
1213 /* Block SIGTERM, so that we know that it won't get lost in
1214 * the child */
1215
1216 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1217
1218 parent_pid = getpid_cached();
1219
1220 r = safe_fork("(sd-pam)", 0, &pam_pid);
1221 if (r < 0)
1222 goto fail;
1223 if (r == 0) {
1224 int sig, ret = EXIT_PAM;
1225
1226 /* The child's job is to reset the PAM session on
1227 * termination */
1228 barrier_set_role(&barrier, BARRIER_CHILD);
1229
1230 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
1231 * are open here that have been opened by PAM. */
1232 (void) close_many(fds, n_fds);
1233
1234 /* Drop privileges - we don't need any to pam_close_session
1235 * and this will make PR_SET_PDEATHSIG work in most cases.
1236 * If this fails, ignore the error - but expect sd-pam threads
1237 * to fail to exit normally */
1238
1239 r = maybe_setgroups(0, NULL);
1240 if (r < 0)
1241 log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
1242 if (setresgid(gid, gid, gid) < 0)
1243 log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
1244 if (setresuid(uid, uid, uid) < 0)
1245 log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
1246
1247 (void) ignore_signals(SIGPIPE, -1);
1248
1249 /* Wait until our parent died. This will only work if
1250 * the above setresuid() succeeds, otherwise the kernel
1251 * will not allow unprivileged parents kill their privileged
1252 * children this way. We rely on the control groups kill logic
1253 * to do the rest for us. */
1254 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1255 goto child_finish;
1256
1257 /* Tell the parent that our setup is done. This is especially
1258 * important regarding dropping privileges. Otherwise, unit
1259 * setup might race against our setresuid(2) call.
1260 *
1261 * If the parent aborted, we'll detect this below, hence ignore
1262 * return failure here. */
1263 (void) barrier_place(&barrier);
1264
1265 /* Check if our parent process might already have died? */
1266 if (getppid() == parent_pid) {
1267 sigset_t ss;
1268
1269 assert_se(sigemptyset(&ss) >= 0);
1270 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1271
1272 for (;;) {
1273 if (sigwait(&ss, &sig) < 0) {
1274 if (errno == EINTR)
1275 continue;
1276
1277 goto child_finish;
1278 }
1279
1280 assert(sig == SIGTERM);
1281 break;
1282 }
1283 }
1284
1285 /* If our parent died we'll end the session */
1286 if (getppid() != parent_pid) {
1287 pam_code = pam_close_session(handle, flags);
1288 if (pam_code != PAM_SUCCESS)
1289 goto child_finish;
1290 }
1291
1292 ret = 0;
1293
1294 child_finish:
1295 pam_end(handle, pam_code | flags);
1296 _exit(ret);
1297 }
1298
1299 barrier_set_role(&barrier, BARRIER_PARENT);
1300
1301 /* If the child was forked off successfully it will do all the
1302 * cleanups, so forget about the handle here. */
1303 handle = NULL;
1304
1305 /* Unblock SIGTERM again in the parent */
1306 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1307
1308 /* We close the log explicitly here, since the PAM modules
1309 * might have opened it, but we don't want this fd around. */
1310 closelog();
1311
1312 /* Synchronously wait for the child to initialize. We don't care for
1313 * errors as we cannot recover. However, warn loudly if it happens. */
1314 if (!barrier_place_and_sync(&barrier))
1315 log_error("PAM initialization failed");
1316
1317 return strv_free_and_replace(*env, e);
1318
1319 fail:
1320 if (pam_code != PAM_SUCCESS) {
1321 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1322 r = -EPERM; /* PAM errors do not map to errno */
1323 } else
1324 log_error_errno(r, "PAM failed: %m");
1325
1326 if (handle) {
1327 if (close_session)
1328 pam_code = pam_close_session(handle, flags);
1329
1330 pam_end(handle, pam_code | flags);
1331 }
1332
1333 strv_free(e);
1334 closelog();
1335
1336 return r;
1337 #else
1338 return 0;
1339 #endif
1340 }
1341
/* Renames the current process to "(<name>)", where <name> is derived from the last component of the
 * given path. If the last component is empty, the fixed name "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char title[11];
        const char *leaf, *tail;
        size_t len, keep;

        /* The result has to fit in 10 characters (i.e. the length of "/sbin/init") so that it looks
         * pretty in /bin/ps: two parentheses plus at most eight characters of the name. */

        leaf = basename(path);
        if (isempty(leaf)) {
                rename_process("(...)");
                return;
        }

        /* Keep the trailing part of the name: it is usually the more interesting bit, since the
         * beginning might just be "systemd-". */
        len = strlen(leaf);
        keep = len > 8 ? 8 : len;
        tail = leaf + (len - keep);

        title[0] = '(';
        memcpy(title + 1, tail, keep);
        title[keep + 1] = ')';
        title[keep + 2] = 0;

        rename_process(title);
}
1372
1373 static bool context_has_address_families(const ExecContext *c) {
1374 assert(c);
1375
1376 return c->address_families_whitelist ||
1377 !set_isempty(c->address_families);
1378 }
1379
1380 static bool context_has_syscall_filters(const ExecContext *c) {
1381 assert(c);
1382
1383 return c->syscall_whitelist ||
1384 !hashmap_isempty(c->syscall_filter);
1385 }
1386
1387 static bool context_has_no_new_privileges(const ExecContext *c) {
1388 assert(c);
1389
1390 if (c->no_new_privileges)
1391 return true;
1392
1393 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1394 return false;
1395
1396 /* We need NNP if we have any form of seccomp and are unprivileged */
1397 return context_has_address_families(c) ||
1398 c->memory_deny_write_execute ||
1399 c->restrict_realtime ||
1400 c->restrict_suid_sgid ||
1401 exec_context_restrict_namespaces_set(c) ||
1402 c->protect_kernel_tunables ||
1403 c->protect_kernel_modules ||
1404 c->private_devices ||
1405 context_has_syscall_filters(c) ||
1406 !set_isempty(c->syscall_archs) ||
1407 c->lock_personality ||
1408 c->protect_hostname;
1409 }
1410
1411 #if HAVE_SECCOMP
1412
1413 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1414
1415 if (is_seccomp_available())
1416 return false;
1417
1418 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1419 return true;
1420 }
1421
1422 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1423 uint32_t negative_action, default_action, action;
1424 int r;
1425
1426 assert(u);
1427 assert(c);
1428
1429 if (!context_has_syscall_filters(c))
1430 return 0;
1431
1432 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1433 return 0;
1434
1435 negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1436
1437 if (c->syscall_whitelist) {
1438 default_action = negative_action;
1439 action = SCMP_ACT_ALLOW;
1440 } else {
1441 default_action = SCMP_ACT_ALLOW;
1442 action = negative_action;
1443 }
1444
1445 if (needs_ambient_hack) {
1446 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1447 if (r < 0)
1448 return r;
1449 }
1450
1451 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1452 }
1453
1454 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1455 assert(u);
1456 assert(c);
1457
1458 if (set_isempty(c->syscall_archs))
1459 return 0;
1460
1461 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1462 return 0;
1463
1464 return seccomp_restrict_archs(c->syscall_archs);
1465 }
1466
1467 static int apply_address_families(const Unit* u, const ExecContext *c) {
1468 assert(u);
1469 assert(c);
1470
1471 if (!context_has_address_families(c))
1472 return 0;
1473
1474 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1475 return 0;
1476
1477 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1478 }
1479
1480 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1481 assert(u);
1482 assert(c);
1483
1484 if (!c->memory_deny_write_execute)
1485 return 0;
1486
1487 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1488 return 0;
1489
1490 return seccomp_memory_deny_write_execute();
1491 }
1492
1493 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1494 assert(u);
1495 assert(c);
1496
1497 if (!c->restrict_realtime)
1498 return 0;
1499
1500 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1501 return 0;
1502
1503 return seccomp_restrict_realtime();
1504 }
1505
1506 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1507 assert(u);
1508 assert(c);
1509
1510 if (!c->restrict_suid_sgid)
1511 return 0;
1512
1513 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1514 return 0;
1515
1516 return seccomp_restrict_suid_sgid();
1517 }
1518
1519 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1520 assert(u);
1521 assert(c);
1522
1523 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1524 * let's protect even those systems where this is left on in the kernel. */
1525
1526 if (!c->protect_kernel_tunables)
1527 return 0;
1528
1529 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1530 return 0;
1531
1532 return seccomp_protect_sysctl();
1533 }
1534
1535 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1536 assert(u);
1537 assert(c);
1538
1539 /* Turn off module syscalls on ProtectKernelModules=yes */
1540
1541 if (!c->protect_kernel_modules)
1542 return 0;
1543
1544 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1545 return 0;
1546
1547 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1548 }
1549
1550 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1551 assert(u);
1552 assert(c);
1553
1554 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1555
1556 if (!c->private_devices)
1557 return 0;
1558
1559 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1560 return 0;
1561
1562 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1563 }
1564
1565 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1566 assert(u);
1567 assert(c);
1568
1569 if (!exec_context_restrict_namespaces_set(c))
1570 return 0;
1571
1572 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1573 return 0;
1574
1575 return seccomp_restrict_namespaces(c->restrict_namespaces);
1576 }
1577
1578 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1579 unsigned long personality;
1580 int r;
1581
1582 assert(u);
1583 assert(c);
1584
1585 if (!c->lock_personality)
1586 return 0;
1587
1588 if (skip_seccomp_unavailable(u, "LockPersonality="))
1589 return 0;
1590
1591 personality = c->personality;
1592
1593 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1594 if (personality == PERSONALITY_INVALID) {
1595
1596 r = opinionated_personality(&personality);
1597 if (r < 0)
1598 return r;
1599 }
1600
1601 return seccomp_lock_personality(personality);
1602 }
1603
1604 #endif
1605
1606 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1607 assert(idle_pipe);
1608
1609 idle_pipe[1] = safe_close(idle_pipe[1]);
1610 idle_pipe[2] = safe_close(idle_pipe[2]);
1611
1612 if (idle_pipe[0] >= 0) {
1613 int r;
1614
1615 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1616
1617 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1618 ssize_t n;
1619
1620 /* Signal systemd that we are bored and want to continue. */
1621 n = write(idle_pipe[3], "x", 1);
1622 if (n > 0)
1623 /* Wait for systemd to react to the signal above. */
1624 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1625 }
1626
1627 idle_pipe[0] = safe_close(idle_pipe[0]);
1628
1629 }
1630
1631 idle_pipe[3] = safe_close(idle_pipe[3]);
1632 }
1633
1634 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1635
1636 static int build_environment(
1637 const Unit *u,
1638 const ExecContext *c,
1639 const ExecParameters *p,
1640 size_t n_fds,
1641 const char *home,
1642 const char *username,
1643 const char *shell,
1644 dev_t journal_stream_dev,
1645 ino_t journal_stream_ino,
1646 char ***ret) {
1647
1648 _cleanup_strv_free_ char **our_env = NULL;
1649 ExecDirectoryType t;
1650 size_t n_env = 0;
1651 char *x;
1652
1653 assert(u);
1654 assert(c);
1655 assert(p);
1656 assert(ret);
1657
1658 our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX);
1659 if (!our_env)
1660 return -ENOMEM;
1661
1662 if (n_fds > 0) {
1663 _cleanup_free_ char *joined = NULL;
1664
1665 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1666 return -ENOMEM;
1667 our_env[n_env++] = x;
1668
1669 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1670 return -ENOMEM;
1671 our_env[n_env++] = x;
1672
1673 joined = strv_join(p->fd_names, ":");
1674 if (!joined)
1675 return -ENOMEM;
1676
1677 x = strjoin("LISTEN_FDNAMES=", joined);
1678 if (!x)
1679 return -ENOMEM;
1680 our_env[n_env++] = x;
1681 }
1682
1683 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1684 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1685 return -ENOMEM;
1686 our_env[n_env++] = x;
1687
1688 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1689 return -ENOMEM;
1690 our_env[n_env++] = x;
1691 }
1692
1693 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1694 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1695 * check the database directly. */
1696 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1697 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1698 if (!x)
1699 return -ENOMEM;
1700 our_env[n_env++] = x;
1701 }
1702
1703 if (home) {
1704 x = strjoin("HOME=", home);
1705 if (!x)
1706 return -ENOMEM;
1707
1708 path_simplify(x + 5, true);
1709 our_env[n_env++] = x;
1710 }
1711
1712 if (username) {
1713 x = strjoin("LOGNAME=", username);
1714 if (!x)
1715 return -ENOMEM;
1716 our_env[n_env++] = x;
1717
1718 x = strjoin("USER=", username);
1719 if (!x)
1720 return -ENOMEM;
1721 our_env[n_env++] = x;
1722 }
1723
1724 if (shell) {
1725 x = strjoin("SHELL=", shell);
1726 if (!x)
1727 return -ENOMEM;
1728
1729 path_simplify(x + 6, true);
1730 our_env[n_env++] = x;
1731 }
1732
1733 if (!sd_id128_is_null(u->invocation_id)) {
1734 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1735 return -ENOMEM;
1736
1737 our_env[n_env++] = x;
1738 }
1739
1740 if (exec_context_needs_term(c)) {
1741 const char *tty_path, *term = NULL;
1742
1743 tty_path = exec_context_tty_path(c);
1744
1745 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1746 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1747 * passes to PID 1 ends up all the way in the console login shown. */
1748
1749 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1750 term = getenv("TERM");
1751 if (!term)
1752 term = default_term_for_tty(tty_path);
1753
1754 x = strjoin("TERM=", term);
1755 if (!x)
1756 return -ENOMEM;
1757 our_env[n_env++] = x;
1758 }
1759
1760 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1761 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1762 return -ENOMEM;
1763
1764 our_env[n_env++] = x;
1765 }
1766
1767 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1768 _cleanup_free_ char *pre = NULL, *joined = NULL;
1769 const char *n;
1770
1771 if (!p->prefix[t])
1772 continue;
1773
1774 if (strv_isempty(c->directories[t].paths))
1775 continue;
1776
1777 n = exec_directory_env_name_to_string(t);
1778 if (!n)
1779 continue;
1780
1781 pre = strjoin(p->prefix[t], "/");
1782 if (!pre)
1783 return -ENOMEM;
1784
1785 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1786 if (!joined)
1787 return -ENOMEM;
1788
1789 x = strjoin(n, "=", joined);
1790 if (!x)
1791 return -ENOMEM;
1792
1793 our_env[n_env++] = x;
1794 }
1795
1796 our_env[n_env++] = NULL;
1797 assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1798
1799 *ret = TAKE_PTR(our_env);
1800
1801 return 0;
1802 }
1803
1804 static int build_pass_environment(const ExecContext *c, char ***ret) {
1805 _cleanup_strv_free_ char **pass_env = NULL;
1806 size_t n_env = 0, n_bufsize = 0;
1807 char **i;
1808
1809 STRV_FOREACH(i, c->pass_environment) {
1810 _cleanup_free_ char *x = NULL;
1811 char *v;
1812
1813 v = getenv(*i);
1814 if (!v)
1815 continue;
1816 x = strjoin(*i, "=", v);
1817 if (!x)
1818 return -ENOMEM;
1819
1820 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1821 return -ENOMEM;
1822
1823 pass_env[n_env++] = TAKE_PTR(x);
1824 pass_env[n_env] = NULL;
1825 }
1826
1827 *ret = TAKE_PTR(pass_env);
1828
1829 return 0;
1830 }
1831
1832 static bool exec_needs_mount_namespace(
1833 const ExecContext *context,
1834 const ExecParameters *params,
1835 const ExecRuntime *runtime) {
1836
1837 assert(context);
1838 assert(params);
1839
1840 if (context->root_image)
1841 return true;
1842
1843 if (!strv_isempty(context->read_write_paths) ||
1844 !strv_isempty(context->read_only_paths) ||
1845 !strv_isempty(context->inaccessible_paths))
1846 return true;
1847
1848 if (context->n_bind_mounts > 0)
1849 return true;
1850
1851 if (context->n_temporary_filesystems > 0)
1852 return true;
1853
1854 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
1855 return true;
1856
1857 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1858 return true;
1859
1860 if (context->private_devices ||
1861 context->private_mounts ||
1862 context->protect_system != PROTECT_SYSTEM_NO ||
1863 context->protect_home != PROTECT_HOME_NO ||
1864 context->protect_kernel_tunables ||
1865 context->protect_kernel_modules ||
1866 context->protect_control_groups)
1867 return true;
1868
1869 if (context->root_directory) {
1870 ExecDirectoryType t;
1871
1872 if (context->mount_apivfs)
1873 return true;
1874
1875 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1876 if (!params->prefix[t])
1877 continue;
1878
1879 if (!strv_isempty(context->directories[t].paths))
1880 return true;
1881 }
1882 }
1883
1884 if (context->dynamic_user &&
1885 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1886 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1887 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1888 return true;
1889
1890 return false;
1891 }
1892
/* Moves the calling process into a new user namespace and has a temporary helper child write the
 * UID/GID maps for it.
 *
 * uid, gid: the user and group to map onto themselves in addition to root. If a value is 0 or not
 *           valid, only the root mapping is written for that map.
 *
 * Returns 0 on success and a negative errno-style error code on failure: -ENOMEM if a map string
 * cannot be allocated, the error of a failing system call, the error code reported by the child
 * through the error pipe, or -EIO if the child reported something unexpected or did not exit with
 * EXIT_SUCCESS. */
static int setup_private_users(uid_t uid, gid_t gid) {
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
        _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
        _cleanup_close_ int unshare_ready_fd = -1;
        _cleanup_(sigkill_waitp) pid_t pid = 0;
        uint64_t c = 1; /* value the parent writes to the eventfd; the child reads into this, too */
        ssize_t n;
        int r;

        /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
         * which waits for the parent to create the new user namespace while staying in the original namespace. The
         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
         * continues execution normally. */

        if (uid != 0 && uid_is_valid(uid)) {
                r = asprintf(&uid_map,
                             "0 0 1\n"                      /* Map root → root */
                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
                             uid, uid);
                if (r < 0)
                        return -ENOMEM;
        } else {
                uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
                if (!uid_map)
                        return -ENOMEM;
        }

        if (gid != 0 && gid_is_valid(gid)) {
                r = asprintf(&gid_map,
                             "0 0 1\n"                      /* Map root → root */
                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
                             gid, gid);
                if (r < 0)
                        return -ENOMEM;
        } else {
                gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
                if (!gid_map)
                        return -ENOMEM;
        }

        /* Create a communication channel so that the parent can tell the child when it finished creating the user
         * namespace. */
        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
        if (unshare_ready_fd < 0)
                return -errno;

        /* Create a communication channel so that the child can tell the parent a proper error code in case it
         * failed. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return -errno;

        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
        if (r < 0)
                return r;
        if (r == 0) {
                _cleanup_close_ int fd = -1;
                const char *a;
                pid_t ppid;

                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
                 * here, after the parent opened its own user namespace. */

                ppid = getppid();
                /* The child only writes to the error pipe, hence close the read end */
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Wait until the parent unshared the user namespace */
                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Disable the setgroups() system call in the child user namespace, for good. */
                a = procfs_file_alloca(ppid, "setgroups");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        if (errno != ENOENT) {
                                r = -errno;
                                goto child_fail;
                        }

                        /* If the file is missing the kernel is too old, let's continue anyway. */
                } else {
                        if (write(fd, "deny\n", 5) < 0) {
                                r = -errno;
                                goto child_fail;
                        }

                        fd = safe_close(fd);
                }

                /* First write the GID map */
                a = procfs_file_alloca(ppid, "gid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, gid_map, strlen(gid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }
                fd = safe_close(fd);

                /* Then write the UID map */
                a = procfs_file_alloca(ppid, "uid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, uid_map, strlen(uid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Success: exit without writing anything to the error pipe */
                _exit(EXIT_SUCCESS);

        child_fail:
                /* Report the negative error code to the parent, on a best-effort basis */
                (void) write(errno_pipe[1], &r, sizeof(r));
                _exit(EXIT_FAILURE);
        }

        /* Parent: we only read from the error pipe, hence close the write end */
        errno_pipe[1] = safe_close(errno_pipe[1]);

        if (unshare(CLONE_NEWUSER) < 0)
                return -errno;

        /* Let the child know that the namespace is ready now */
        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
                return -errno;

        /* Try to read an error code from the child */
        n = read(errno_pipe[0], &r, sizeof(r));
        if (n < 0)
                return -errno;
        if (n == sizeof(r)) { /* an error code was sent to us */
                if (r < 0)
                        return r;
                return -EIO; /* the child is only supposed to send negative values */
        }
        if (n != 0) /* on success we should have read 0 bytes */
                return -EIO;

        r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
        pid = 0; /* presumably so that the sigkill_waitp cleanup leaves the already-waited-for child alone */
        if (r < 0)
                return r;
        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
                return -EIO;

        return 0;
}
2047
/* Creates and prepares all directories of the given type that are configured in the context, below
 * params->prefix[type].
 *
 * uid, gid:    the owner the directory trees are recursively changed to. If EXEC_CHOWN_DIRECTORIES
 *              is set in params->flags, an invalid UID/GID is replaced by 0.
 * type:        selects both the list of paths (context->directories[type]) and the prefix.
 * exit_status: on failure, set to the exit status associated with the directory type.
 *
 * Returns 0 on success, or if no prefix is configured for the type. On failure returns a negative
 * errno-style error code and sets *exit_status; *exit_status is not touched on success. */
static int setup_exec_directory(
                const ExecContext *context,
                const ExecParameters *params,
                uid_t uid,
                gid_t gid,
                ExecDirectoryType type,
                int *exit_status) {

        /* Maps each directory type to the exit status reported when setting it up fails */
        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
                [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
                [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
                [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
                [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
        };
        char **rt;
        int r;

        assert(context);
        assert(params);
        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
        assert(exit_status);

        /* Without a prefix for this directory type there is nothing to set up */
        if (!params->prefix[type])
                return 0;

        /* If chowning was requested, fall back to 0 for any UID/GID that is not valid */
        if (params->flags & EXEC_CHOWN_DIRECTORIES) {
                if (!uid_is_valid(uid))
                        uid = 0;
                if (!gid_is_valid(gid))
                        gid = 0;
        }

        STRV_FOREACH(rt, context->directories[type].paths) {
                /* p is the public location of the directory, pp the location below "private/". pp
                 * remains NULL unless the private/ logic below is used. */
                _cleanup_free_ char *p = NULL, *pp = NULL;

                p = path_join(params->prefix[type], *rt);
                if (!p) {
                        r = -ENOMEM;
                        goto fail;
                }

                r = mkdir_parents_label(p, 0755);
                if (r < 0)
                        goto fail;

                if (context->dynamic_user &&
                    (!IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) ||
                     (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode != EXEC_PRESERVE_NO))) {
                        _cleanup_free_ char *private_root = NULL;

                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
                         * case we want to avoid leaving a directory around fully accessible that is owned by
                         * a dynamic user whose UID is later on reused. To lock this down we use the same
                         * trick used by container managers to prohibit host users to get access to files of
                         * the same UID in containers: we place everything inside a directory that has an
                         * access mode of 0700 and is owned root:root, so that it acts as security boundary
                         * for unprivileged host code. We then use fs namespacing to make this directory
                         * permeable for the service itself.
                         *
                         * Specifically: for a service which wants a special directory "foo/" we first create
                         * a directory "private/" with access mode 0700 owned by root:root. Then we place
                         * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
                         * "private/foo". This way, privileged host users can access "foo/" as usual, but
                         * unprivileged host users can't look into it. Inside of the namespace of the unit
                         * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
                         * "private/foo/" is mounted under the same name, thus disabling the access boundary
                         * for the service and making sure it only gets access to the dirs it needs but no
                         * others. Tricky? Yes, absolutely, but it works!
                         *
                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
                         * to be owned by the service itself.
                         *
                         * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
                         * for sharing files or sockets with other services, unless
                         * runtime_directory_preserve_mode is something other than EXEC_PRESERVE_NO (see
                         * the condition above). */

                        private_root = path_join(params->prefix[type], "private");
                        if (!private_root) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
                        r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
                        if (r < 0)
                                goto fail;

                        pp = path_join(private_root, *rt);
                        if (!pp) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* Create all directories between the configured directory and this private root, and mark them 0755 */
                        r = mkdir_parents_label(pp, 0755);
                        if (r < 0)
                                goto fail;

                        if (is_dir(p, false) > 0 &&
                            (laccess(pp, F_OK) < 0 && errno == ENOENT)) {

                                /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
                                 * it over. Most likely the service has been upgraded from one that didn't use
                                 * DynamicUser=1, to one that does. */

                                log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
                                         "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
                                         exec_directory_type_to_string(type), p, pp);

                                if (rename(p, pp) < 0) {
                                        r = -errno;
                                        goto fail;
                                }
                        } else {
                                /* Otherwise, create the actual directory for the service */

                                r = mkdir_label(pp, context->directories[type].mode);
                                if (r < 0 && r != -EEXIST)
                                        goto fail;
                        }

                        /* And link it up from the original place */
                        r = symlink_idempotent(pp, p, true);
                        if (r < 0)
                                goto fail;

                } else {
                        _cleanup_free_ char *target = NULL;

                        if (type != EXEC_DIRECTORY_CONFIGURATION &&
                            readlink_and_make_absolute(p, &target) >= 0) {
                                _cleanup_free_ char *q = NULL;

                                /* This already exists and is a symlink? Interesting. Maybe it's one created
                                 * by DynamicUser=1 (see above)?
                                 *
                                 * We do this for all directory types except for ConfigurationDirectory=,
                                 * since they all support the private/ symlink logic at least in some
                                 * configurations, see above. */

                                q = path_join(params->prefix[type], "private", *rt);
                                if (!q) {
                                        r = -ENOMEM;
                                        goto fail;
                                }

                                if (path_equal(q, target)) {

                                        /* Hmm, apparently DynamicUser= was once turned on for this service,
                                         * but is no longer. Let's move the directory back up. */

                                        log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
                                                 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
                                                 exec_directory_type_to_string(type), q, p);

                                        /* Remove the symlink first, then move the directory into its place */
                                        if (unlink(p) < 0) {
                                                r = -errno;
                                                goto fail;
                                        }

                                        if (rename(q, p) < 0) {
                                                r = -errno;
                                                goto fail;
                                        }
                                }
                        }

                        r = mkdir_label(p, context->directories[type].mode);
                        if (r < 0) {
                                /* An already existing directory is fine, everything else is an error */
                                if (r != -EEXIST)
                                        goto fail;

                                if (type == EXEC_DIRECTORY_CONFIGURATION) {
                                        struct stat st;

                                        /* Don't change the owner/access mode of the configuration directory,
                                         * as in the common case it is not written to by a service, and shall
                                         * not be writable. */

                                        if (stat(p, &st) < 0) {
                                                r = -errno;
                                                goto fail;
                                        }

                                        /* Still complain if the access mode doesn't match */
                                        if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
                                                log_warning("%s \'%s\' already exists but the mode is different. "
                                                            "(File system: %o %sMode: %o)",
                                                            exec_directory_type_to_string(type), *rt,
                                                            st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);

                                        /* Skip the chmod/chown steps below for this directory */
                                        continue;
                                }
                        }
                }

                /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
                 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
                 * current UID/GID ownership.) */
                r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
                if (r < 0)
                        goto fail;

                /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
                 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
                 * assignments to exist.*/
                r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
                if (r < 0)
                        goto fail;
        }

        return 0;

fail:
        /* Tell the caller which exit status belongs to a failure for this directory type */
        *exit_status = exit_status_table[type];
        return r;
}
2265
2266 #if ENABLE_SMACK
2267 static int setup_smack(
2268 const ExecContext *context,
2269 const ExecCommand *command) {
2270
2271 int r;
2272
2273 assert(context);
2274 assert(command);
2275
2276 if (context->smack_process_label) {
2277 r = mac_smack_apply_pid(0, context->smack_process_label);
2278 if (r < 0)
2279 return r;
2280 }
2281 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2282 else {
2283 _cleanup_free_ char *exec_label = NULL;
2284
2285 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2286 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2287 return r;
2288
2289 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2290 if (r < 0)
2291 return r;
2292 }
2293 #endif
2294
2295 return 0;
2296 }
2297 #endif
2298
2299 static int compile_bind_mounts(
2300 const ExecContext *context,
2301 const ExecParameters *params,
2302 BindMount **ret_bind_mounts,
2303 size_t *ret_n_bind_mounts,
2304 char ***ret_empty_directories) {
2305
2306 _cleanup_strv_free_ char **empty_directories = NULL;
2307 BindMount *bind_mounts;
2308 size_t n, h = 0, i;
2309 ExecDirectoryType t;
2310 int r;
2311
2312 assert(context);
2313 assert(params);
2314 assert(ret_bind_mounts);
2315 assert(ret_n_bind_mounts);
2316 assert(ret_empty_directories);
2317
2318 n = context->n_bind_mounts;
2319 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2320 if (!params->prefix[t])
2321 continue;
2322
2323 n += strv_length(context->directories[t].paths);
2324 }
2325
2326 if (n <= 0) {
2327 *ret_bind_mounts = NULL;
2328 *ret_n_bind_mounts = 0;
2329 *ret_empty_directories = NULL;
2330 return 0;
2331 }
2332
2333 bind_mounts = new(BindMount, n);
2334 if (!bind_mounts)
2335 return -ENOMEM;
2336
2337 for (i = 0; i < context->n_bind_mounts; i++) {
2338 BindMount *item = context->bind_mounts + i;
2339 char *s, *d;
2340
2341 s = strdup(item->source);
2342 if (!s) {
2343 r = -ENOMEM;
2344 goto finish;
2345 }
2346
2347 d = strdup(item->destination);
2348 if (!d) {
2349 free(s);
2350 r = -ENOMEM;
2351 goto finish;
2352 }
2353
2354 bind_mounts[h++] = (BindMount) {
2355 .source = s,
2356 .destination = d,
2357 .read_only = item->read_only,
2358 .recursive = item->recursive,
2359 .ignore_enoent = item->ignore_enoent,
2360 };
2361 }
2362
2363 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2364 char **suffix;
2365
2366 if (!params->prefix[t])
2367 continue;
2368
2369 if (strv_isempty(context->directories[t].paths))
2370 continue;
2371
2372 if (context->dynamic_user &&
2373 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2374 !(context->root_directory || context->root_image)) {
2375 char *private_root;
2376
2377 /* So this is for a dynamic user, and we need to make sure the process can access its own
2378 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2379 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2380
2381 private_root = path_join(params->prefix[t], "private");
2382 if (!private_root) {
2383 r = -ENOMEM;
2384 goto finish;
2385 }
2386
2387 r = strv_consume(&empty_directories, private_root);
2388 if (r < 0)
2389 goto finish;
2390 }
2391
2392 STRV_FOREACH(suffix, context->directories[t].paths) {
2393 char *s, *d;
2394
2395 if (context->dynamic_user &&
2396 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2397 s = path_join(params->prefix[t], "private", *suffix);
2398 else
2399 s = path_join(params->prefix[t], *suffix);
2400 if (!s) {
2401 r = -ENOMEM;
2402 goto finish;
2403 }
2404
2405 if (context->dynamic_user &&
2406 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2407 (context->root_directory || context->root_image))
2408 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2409 * directory is not created on the root directory. So, let's bind-mount the directory
2410 * on the 'non-private' place. */
2411 d = path_join(params->prefix[t], *suffix);
2412 else
2413 d = strdup(s);
2414 if (!d) {
2415 free(s);
2416 r = -ENOMEM;
2417 goto finish;
2418 }
2419
2420 bind_mounts[h++] = (BindMount) {
2421 .source = s,
2422 .destination = d,
2423 .read_only = false,
2424 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2425 .recursive = true,
2426 .ignore_enoent = false,
2427 };
2428 }
2429 }
2430
2431 assert(h == n);
2432
2433 *ret_bind_mounts = bind_mounts;
2434 *ret_n_bind_mounts = n;
2435 *ret_empty_directories = TAKE_PTR(empty_directories);
2436
2437 return (int) n;
2438
2439 finish:
2440 bind_mount_free_many(bind_mounts, h);
2441 return r;
2442 }
2443
2444 static int apply_mount_namespace(
2445 const Unit *u,
2446 const ExecCommand *command,
2447 const ExecContext *context,
2448 const ExecParameters *params,
2449 const ExecRuntime *runtime,
2450 char **error_path) {
2451
2452 _cleanup_strv_free_ char **empty_directories = NULL;
2453 char *tmp = NULL, *var = NULL;
2454 const char *root_dir = NULL, *root_image = NULL;
2455 NamespaceInfo ns_info;
2456 bool needs_sandboxing;
2457 BindMount *bind_mounts = NULL;
2458 size_t n_bind_mounts = 0;
2459 int r;
2460
2461 assert(context);
2462
2463 /* The runtime struct only contains the parent of the private /tmp,
2464 * which is non-accessible to world users. Inside of it there's a /tmp
2465 * that is sticky, and that's the one we want to use here. */
2466
2467 if (context->private_tmp && runtime) {
2468 if (runtime->tmp_dir)
2469 tmp = strjoina(runtime->tmp_dir, "/tmp");
2470 if (runtime->var_tmp_dir)
2471 var = strjoina(runtime->var_tmp_dir, "/tmp");
2472 }
2473
2474 if (params->flags & EXEC_APPLY_CHROOT) {
2475 root_image = context->root_image;
2476
2477 if (!root_image)
2478 root_dir = context->root_directory;
2479 }
2480
2481 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2482 if (r < 0)
2483 return r;
2484
2485 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2486 if (needs_sandboxing)
2487 ns_info = (NamespaceInfo) {
2488 .ignore_protect_paths = false,
2489 .private_dev = context->private_devices,
2490 .protect_control_groups = context->protect_control_groups,
2491 .protect_kernel_tunables = context->protect_kernel_tunables,
2492 .protect_kernel_modules = context->protect_kernel_modules,
2493 .protect_hostname = context->protect_hostname,
2494 .mount_apivfs = context->mount_apivfs,
2495 .private_mounts = context->private_mounts,
2496 };
2497 else if (!context->dynamic_user && root_dir)
2498 /*
2499 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2500 * sandbox info, otherwise enforce it, don't ignore protected paths and
2501 * fail if we are enable to apply the sandbox inside the mount namespace.
2502 */
2503 ns_info = (NamespaceInfo) {
2504 .ignore_protect_paths = true,
2505 };
2506 else
2507 ns_info = (NamespaceInfo) {};
2508
2509 if (context->mount_flags == MS_SHARED)
2510 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2511
2512 r = setup_namespace(root_dir, root_image,
2513 &ns_info, context->read_write_paths,
2514 needs_sandboxing ? context->read_only_paths : NULL,
2515 needs_sandboxing ? context->inaccessible_paths : NULL,
2516 empty_directories,
2517 bind_mounts,
2518 n_bind_mounts,
2519 context->temporary_filesystems,
2520 context->n_temporary_filesystems,
2521 tmp,
2522 var,
2523 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2524 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2525 context->mount_flags,
2526 DISSECT_IMAGE_DISCARD_ON_LOOP,
2527 error_path);
2528
2529 bind_mount_free_many(bind_mounts, n_bind_mounts);
2530
2531 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2532 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2533 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2534 * completely different execution environment. */
2535 if (r == -ENOANO) {
2536 if (n_bind_mounts == 0 &&
2537 context->n_temporary_filesystems == 0 &&
2538 !root_dir && !root_image &&
2539 !context->dynamic_user) {
2540 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2541 return 0;
2542 }
2543
2544 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2545 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2546 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2547
2548 return -EOPNOTSUPP;
2549 }
2550
2551 return r;
2552 }
2553
2554 static int apply_working_directory(
2555 const ExecContext *context,
2556 const ExecParameters *params,
2557 const char *home,
2558 int *exit_status) {
2559
2560 const char *d, *wd;
2561
2562 assert(context);
2563 assert(exit_status);
2564
2565 if (context->working_directory_home) {
2566
2567 if (!home) {
2568 *exit_status = EXIT_CHDIR;
2569 return -ENXIO;
2570 }
2571
2572 wd = home;
2573
2574 } else if (context->working_directory)
2575 wd = context->working_directory;
2576 else
2577 wd = "/";
2578
2579 if (params->flags & EXEC_APPLY_CHROOT)
2580 d = wd;
2581 else
2582 d = prefix_roota(context->root_directory, wd);
2583
2584 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2585 *exit_status = EXIT_CHDIR;
2586 return -errno;
2587 }
2588
2589 return 0;
2590 }
2591
2592 static int apply_root_directory(
2593 const ExecContext *context,
2594 const ExecParameters *params,
2595 const bool needs_mount_ns,
2596 int *exit_status) {
2597
2598 assert(context);
2599 assert(exit_status);
2600
2601 if (params->flags & EXEC_APPLY_CHROOT) {
2602 if (!needs_mount_ns && context->root_directory)
2603 if (chroot(context->root_directory) < 0) {
2604 *exit_status = EXIT_CHROOT;
2605 return -errno;
2606 }
2607 }
2608
2609 return 0;
2610 }
2611
2612 static int setup_keyring(
2613 const Unit *u,
2614 const ExecContext *context,
2615 const ExecParameters *p,
2616 uid_t uid, gid_t gid) {
2617
2618 key_serial_t keyring;
2619 int r = 0;
2620 uid_t saved_uid;
2621 gid_t saved_gid;
2622
2623 assert(u);
2624 assert(context);
2625 assert(p);
2626
2627 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2628 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2629 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2630 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2631 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2632 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2633
2634 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2635 return 0;
2636
2637 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2638 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2639 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2640 * & group is just as nasty as acquiring a reference to the user keyring. */
2641
2642 saved_uid = getuid();
2643 saved_gid = getgid();
2644
2645 if (gid_is_valid(gid) && gid != saved_gid) {
2646 if (setregid(gid, -1) < 0)
2647 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2648 }
2649
2650 if (uid_is_valid(uid) && uid != saved_uid) {
2651 if (setreuid(uid, -1) < 0) {
2652 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2653 goto out;
2654 }
2655 }
2656
2657 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2658 if (keyring == -1) {
2659 if (errno == ENOSYS)
2660 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2661 else if (IN_SET(errno, EACCES, EPERM))
2662 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2663 else if (errno == EDQUOT)
2664 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2665 else
2666 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2667
2668 goto out;
2669 }
2670
2671 /* When requested link the user keyring into the session keyring. */
2672 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2673
2674 if (keyctl(KEYCTL_LINK,
2675 KEY_SPEC_USER_KEYRING,
2676 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2677 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2678 goto out;
2679 }
2680 }
2681
2682 /* Restore uid/gid back */
2683 if (uid_is_valid(uid) && uid != saved_uid) {
2684 if (setreuid(saved_uid, -1) < 0) {
2685 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2686 goto out;
2687 }
2688 }
2689
2690 if (gid_is_valid(gid) && gid != saved_gid) {
2691 if (setregid(saved_gid, -1) < 0)
2692 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2693 }
2694
2695 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2696 if (!sd_id128_is_null(u->invocation_id)) {
2697 key_serial_t key;
2698
2699 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2700 if (key == -1)
2701 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2702 else {
2703 if (keyctl(KEYCTL_SETPERM, key,
2704 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2705 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2706 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2707 }
2708 }
2709
2710 out:
2711 /* Revert back uid & gid for the the last time, and exit */
2712 /* no extra logging, as only the first already reported error matters */
2713 if (getuid() != saved_uid)
2714 (void) setreuid(saved_uid, -1);
2715
2716 if (getgid() != saved_gid)
2717 (void) setregid(saved_gid, -1);
2718
2719 return r;
2720 }
2721
/* Appends every valid (i.e. non-negative) file descriptor of the two-element pair to array[], starting at index
 * *n, and advances *n by the number of entries added. The caller must make sure the array is large enough. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        size_t k;

        assert(array);
        assert(n);
        assert(pair);

        for (k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2732
2733 static int close_remaining_fds(
2734 const ExecParameters *params,
2735 const ExecRuntime *runtime,
2736 const DynamicCreds *dcreds,
2737 int user_lookup_fd,
2738 int socket_fd,
2739 int exec_fd,
2740 int *fds, size_t n_fds) {
2741
2742 size_t n_dont_close = 0;
2743 int dont_close[n_fds + 12];
2744
2745 assert(params);
2746
2747 if (params->stdin_fd >= 0)
2748 dont_close[n_dont_close++] = params->stdin_fd;
2749 if (params->stdout_fd >= 0)
2750 dont_close[n_dont_close++] = params->stdout_fd;
2751 if (params->stderr_fd >= 0)
2752 dont_close[n_dont_close++] = params->stderr_fd;
2753
2754 if (socket_fd >= 0)
2755 dont_close[n_dont_close++] = socket_fd;
2756 if (exec_fd >= 0)
2757 dont_close[n_dont_close++] = exec_fd;
2758 if (n_fds > 0) {
2759 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2760 n_dont_close += n_fds;
2761 }
2762
2763 if (runtime)
2764 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2765
2766 if (dcreds) {
2767 if (dcreds->user)
2768 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2769 if (dcreds->group)
2770 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2771 }
2772
2773 if (user_lookup_fd >= 0)
2774 dont_close[n_dont_close++] = user_lookup_fd;
2775
2776 return close_all_fds(dont_close, n_dont_close);
2777 }
2778
2779 static int send_user_lookup(
2780 Unit *unit,
2781 int user_lookup_fd,
2782 uid_t uid,
2783 gid_t gid) {
2784
2785 assert(unit);
2786
2787 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2788 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2789 * specified. */
2790
2791 if (user_lookup_fd < 0)
2792 return 0;
2793
2794 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2795 return 0;
2796
2797 if (writev(user_lookup_fd,
2798 (struct iovec[]) {
2799 IOVEC_INIT(&uid, sizeof(uid)),
2800 IOVEC_INIT(&gid, sizeof(gid)),
2801 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2802 return -errno;
2803
2804 return 0;
2805 }
2806
2807 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2808 int r;
2809
2810 assert(c);
2811 assert(home);
2812 assert(buf);
2813
2814 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2815
2816 if (*home)
2817 return 0;
2818
2819 if (!c->working_directory_home)
2820 return 0;
2821
2822 r = get_home_dir(buf);
2823 if (r < 0)
2824 return r;
2825
2826 *home = *buf;
2827 return 1;
2828 }
2829
2830 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2831 _cleanup_strv_free_ char ** list = NULL;
2832 ExecDirectoryType t;
2833 int r;
2834
2835 assert(c);
2836 assert(p);
2837 assert(ret);
2838
2839 assert(c->dynamic_user);
2840
2841 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2842 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2843 * directories. */
2844
2845 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2846 char **i;
2847
2848 if (t == EXEC_DIRECTORY_CONFIGURATION)
2849 continue;
2850
2851 if (!p->prefix[t])
2852 continue;
2853
2854 STRV_FOREACH(i, c->directories[t].paths) {
2855 char *e;
2856
2857 if (t == EXEC_DIRECTORY_RUNTIME)
2858 e = path_join(p->prefix[t], *i);
2859 else
2860 e = path_join(p->prefix[t], "private", *i);
2861 if (!e)
2862 return -ENOMEM;
2863
2864 r = strv_consume(&list, e);
2865 if (r < 0)
2866 return r;
2867 }
2868 }
2869
2870 *ret = TAKE_PTR(list);
2871
2872 return 0;
2873 }
2874
2875 static char *exec_command_line(char **argv);
2876
2877 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
2878 bool using_subcgroup;
2879 char *p;
2880
2881 assert(params);
2882 assert(ret);
2883
2884 if (!params->cgroup_path)
2885 return -EINVAL;
2886
2887 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
2888 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
2889 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
2890 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
2891 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
2892 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
2893 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
2894 * flag, which is only passed for the former statements, not for the latter. */
2895
2896 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
2897 if (using_subcgroup)
2898 p = path_join(params->cgroup_path, ".control");
2899 else
2900 p = strdup(params->cgroup_path);
2901 if (!p)
2902 return -ENOMEM;
2903
2904 *ret = p;
2905 return using_subcgroup;
2906 }
2907
2908 static int exec_child(
2909 Unit *unit,
2910 const ExecCommand *command,
2911 const ExecContext *context,
2912 const ExecParameters *params,
2913 ExecRuntime *runtime,
2914 DynamicCreds *dcreds,
2915 int socket_fd,
2916 const int named_iofds[static 3],
2917 int *fds,
2918 size_t n_socket_fds,
2919 size_t n_storage_fds,
2920 char **files_env,
2921 int user_lookup_fd,
2922 int *exit_status) {
2923
2924 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
2925 int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
2926 _cleanup_free_ gid_t *supplementary_gids = NULL;
2927 const char *username = NULL, *groupname = NULL;
2928 _cleanup_free_ char *home_buffer = NULL;
2929 const char *home = NULL, *shell = NULL;
2930 char **final_argv = NULL;
2931 dev_t journal_stream_dev = 0;
2932 ino_t journal_stream_ino = 0;
2933 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2934 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
2935 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
2936 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
2937 #if HAVE_SELINUX
2938 _cleanup_free_ char *mac_selinux_context_net = NULL;
2939 bool use_selinux = false;
2940 #endif
2941 #if ENABLE_SMACK
2942 bool use_smack = false;
2943 #endif
2944 #if HAVE_APPARMOR
2945 bool use_apparmor = false;
2946 #endif
2947 uid_t uid = UID_INVALID;
2948 gid_t gid = GID_INVALID;
2949 size_t n_fds;
2950 ExecDirectoryType dt;
2951 int secure_bits;
2952
2953 assert(unit);
2954 assert(command);
2955 assert(context);
2956 assert(params);
2957 assert(exit_status);
2958
2959 rename_process_from_path(command->path);
2960
2961 /* We reset exactly these signals, since they are the
2962 * only ones we set to SIG_IGN in the main daemon. All
2963 * others we leave untouched because we set them to
2964 * SIG_DFL or a valid handler initially, both of which
2965 * will be demoted to SIG_DFL. */
2966 (void) default_signals(SIGNALS_CRASH_HANDLER,
2967 SIGNALS_IGNORE, -1);
2968
2969 if (context->ignore_sigpipe)
2970 (void) ignore_signals(SIGPIPE, -1);
2971
2972 r = reset_signal_mask();
2973 if (r < 0) {
2974 *exit_status = EXIT_SIGNAL_MASK;
2975 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2976 }
2977
2978 if (params->idle_pipe)
2979 do_idle_pipe_dance(params->idle_pipe);
2980
2981 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2982 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2983 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2984 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2985
2986 log_forget_fds();
2987 log_set_open_when_needed(true);
2988
2989 /* In case anything used libc syslog(), close this here, too */
2990 closelog();
2991
2992 n_fds = n_socket_fds + n_storage_fds;
2993 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
2994 if (r < 0) {
2995 *exit_status = EXIT_FDS;
2996 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2997 }
2998
2999 if (!context->same_pgrp)
3000 if (setsid() < 0) {
3001 *exit_status = EXIT_SETSID;
3002 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3003 }
3004
3005 exec_context_tty_reset(context, params);
3006
3007 if (unit_shall_confirm_spawn(unit)) {
3008 const char *vc = params->confirm_spawn;
3009 _cleanup_free_ char *cmdline = NULL;
3010
3011 cmdline = exec_command_line(command->argv);
3012 if (!cmdline) {
3013 *exit_status = EXIT_MEMORY;
3014 return log_oom();
3015 }
3016
3017 r = ask_for_confirmation(vc, unit, cmdline);
3018 if (r != CONFIRM_EXECUTE) {
3019 if (r == CONFIRM_PRETEND_SUCCESS) {
3020 *exit_status = EXIT_SUCCESS;
3021 return 0;
3022 }
3023 *exit_status = EXIT_CONFIRM;
3024 log_unit_error(unit, "Execution cancelled by the user");
3025 return -ECANCELED;
3026 }
3027 }
3028
3029 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3030 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3031 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3032 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3033 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3034 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3035 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3036 *exit_status = EXIT_MEMORY;
3037 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3038 }
3039
3040 if (context->dynamic_user && dcreds) {
3041 _cleanup_strv_free_ char **suggested_paths = NULL;
3042
3043 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3044 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3045 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3046 *exit_status = EXIT_USER;
3047 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3048 }
3049
3050 r = compile_suggested_paths(context, params, &suggested_paths);
3051 if (r < 0) {
3052 *exit_status = EXIT_MEMORY;
3053 return log_oom();
3054 }
3055
3056 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3057 if (r < 0) {
3058 *exit_status = EXIT_USER;
3059 if (r == -EILSEQ) {
3060 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3061 return -EOPNOTSUPP;
3062 }
3063 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3064 }
3065
3066 if (!uid_is_valid(uid)) {
3067 *exit_status = EXIT_USER;
3068 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
3069 return -ESRCH;
3070 }
3071
3072 if (!gid_is_valid(gid)) {
3073 *exit_status = EXIT_USER;
3074 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
3075 return -ESRCH;
3076 }
3077
3078 if (dcreds->user)
3079 username = dcreds->user->name;
3080
3081 } else {
3082 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3083 if (r < 0) {
3084 *exit_status = EXIT_USER;
3085 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3086 }
3087
3088 r = get_fixed_group(context, &groupname, &gid);
3089 if (r < 0) {
3090 *exit_status = EXIT_GROUP;
3091 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3092 }
3093 }
3094
3095 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3096 r = get_supplementary_groups(context, username, groupname, gid,
3097 &supplementary_gids, &ngids);
3098 if (r < 0) {
3099 *exit_status = EXIT_GROUP;
3100 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3101 }
3102
3103 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3104 if (r < 0) {
3105 *exit_status = EXIT_USER;
3106 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3107 }
3108
3109 user_lookup_fd = safe_close(user_lookup_fd);
3110
3111 r = acquire_home(context, uid, &home, &home_buffer);
3112 if (r < 0) {
3113 *exit_status = EXIT_CHDIR;
3114 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3115 }
3116
3117 /* If a socket is connected to STDIN/STDOUT/STDERR, we
3118 * must sure to drop O_NONBLOCK */
3119 if (socket_fd >= 0)
3120 (void) fd_nonblock(socket_fd, false);
3121
3122 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3123 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3124 if (params->cgroup_path) {
3125 _cleanup_free_ char *p = NULL;
3126
3127 r = exec_parameters_get_cgroup_path(params, &p);
3128 if (r < 0) {
3129 *exit_status = EXIT_CGROUP;
3130 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3131 }
3132
3133 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3134 if (r < 0) {
3135 *exit_status = EXIT_CGROUP;
3136 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3137 }
3138 }
3139
3140 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3141 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3142 if (r < 0) {
3143 *exit_status = EXIT_NETWORK;
3144 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3145 }
3146 }
3147
3148 r = setup_input(context, params, socket_fd, named_iofds);
3149 if (r < 0) {
3150 *exit_status = EXIT_STDIN;
3151 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3152 }
3153
3154 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3155 if (r < 0) {
3156 *exit_status = EXIT_STDOUT;
3157 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3158 }
3159
3160 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3161 if (r < 0) {
3162 *exit_status = EXIT_STDERR;
3163 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3164 }
3165
3166 if (context->oom_score_adjust_set) {
3167 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3168 * prohibit write access to this file, and we shouldn't trip up over that. */
3169 r = set_oom_score_adjust(context->oom_score_adjust);
3170 if (IN_SET(r, -EPERM, -EACCES))
3171 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3172 else if (r < 0) {
3173 *exit_status = EXIT_OOM_ADJUST;
3174 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3175 }
3176 }
3177
3178 if (context->nice_set)
3179 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
3180 *exit_status = EXIT_NICE;
3181 return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
3182 }
3183
3184 if (context->cpu_sched_set) {
3185 struct sched_param param = {
3186 .sched_priority = context->cpu_sched_priority,
3187 };
3188
3189 r = sched_setscheduler(0,
3190 context->cpu_sched_policy |
3191 (context->cpu_sched_reset_on_fork ?
3192 SCHED_RESET_ON_FORK : 0),
3193 &param);
3194 if (r < 0) {
3195 *exit_status = EXIT_SETSCHEDULER;
3196 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3197 }
3198 }
3199
3200 if (context->cpu_set.set)
3201 if (sched_setaffinity(0, context->cpu_set.allocated, context->cpu_set.set) < 0) {
3202 *exit_status = EXIT_CPUAFFINITY;
3203 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3204 }
3205
3206 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3207 r = apply_numa_policy(&context->numa_policy);
3208 if (r == -EOPNOTSUPP)
3209 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
3210 else if (r < 0) {
3211 *exit_status = EXIT_NUMA_POLICY;
3212 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
3213 }
3214 }
3215
3216 if (context->ioprio_set)
3217 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3218 *exit_status = EXIT_IOPRIO;
3219 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3220 }
3221
3222 if (context->timer_slack_nsec != NSEC_INFINITY)
3223 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3224 *exit_status = EXIT_TIMERSLACK;
3225 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3226 }
3227
3228 if (context->personality != PERSONALITY_INVALID) {
3229 r = safe_personality(context->personality);
3230 if (r < 0) {
3231 *exit_status = EXIT_PERSONALITY;
3232 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3233 }
3234 }
3235
3236 if (context->utmp_id)
3237 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3238 context->tty_path,
3239 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
3240 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3241 USER_PROCESS,
3242 username);
3243
3244 if (uid_is_valid(uid)) {
3245 r = chown_terminal(STDIN_FILENO, uid);
3246 if (r < 0) {
3247 *exit_status = EXIT_STDIN;
3248 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3249 }
3250 }
3251
3252 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3253 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3254 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3255 * touch a single hierarchy too. */
3256 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3257 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3258 if (r < 0) {
3259 *exit_status = EXIT_CGROUP;
3260 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3261 }
3262 }
3263
3264 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3265 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3266 if (r < 0)
3267 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3268 }
3269
3270 r = build_environment(
3271 unit,
3272 context,
3273 params,
3274 n_fds,
3275 home,
3276 username,
3277 shell,
3278 journal_stream_dev,
3279 journal_stream_ino,
3280 &our_env);
3281 if (r < 0) {
3282 *exit_status = EXIT_MEMORY;
3283 return log_oom();
3284 }
3285
3286 r = build_pass_environment(context, &pass_env);
3287 if (r < 0) {
3288 *exit_status = EXIT_MEMORY;
3289 return log_oom();
3290 }
3291
3292 accum_env = strv_env_merge(5,
3293 params->environment,
3294 our_env,
3295 pass_env,
3296 context->environment,
3297 files_env,
3298 NULL);
3299 if (!accum_env) {
3300 *exit_status = EXIT_MEMORY;
3301 return log_oom();
3302 }
3303 accum_env = strv_env_clean(accum_env);
3304
3305 (void) umask(context->umask);
3306
3307 r = setup_keyring(unit, context, params, uid, gid);
3308 if (r < 0) {
3309 *exit_status = EXIT_KEYRING;
3310 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3311 }
3312
3313 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3314 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3315
3316 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3317 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3318
3319 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3320 if (needs_ambient_hack)
3321 needs_setuid = false;
3322 else
3323 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3324
3325 if (needs_sandboxing) {
3326 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3327 * present. The actual MAC context application will happen later, as late as possible, to avoid
3328 * impacting our own code paths. */
3329
3330 #if HAVE_SELINUX
3331 use_selinux = mac_selinux_use();
3332 #endif
3333 #if ENABLE_SMACK
3334 use_smack = mac_smack_use();
3335 #endif
3336 #if HAVE_APPARMOR
3337 use_apparmor = mac_apparmor_use();
3338 #endif
3339 }
3340
3341 if (needs_sandboxing) {
3342 int which_failed;
3343
3344 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3345 * is set here. (See below.) */
3346
3347 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3348 if (r < 0) {
3349 *exit_status = EXIT_LIMITS;
3350 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3351 }
3352 }
3353
3354 if (needs_setuid) {
3355
3356 /* Let's call into PAM after we set up our own idea of resource limits to that pam_limits
3357 * wins here. (See above.) */
3358
3359 if (context->pam_name && username) {
3360 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3361 if (r < 0) {
3362 *exit_status = EXIT_PAM;
3363 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3364 }
3365 }
3366 }
3367
3368 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3369
3370 if (ns_type_supported(NAMESPACE_NET)) {
3371 r = setup_netns(runtime->netns_storage_socket);
3372 if (r < 0) {
3373 *exit_status = EXIT_NETWORK;
3374 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3375 }
3376 } else if (context->network_namespace_path) {
3377 *exit_status = EXIT_NETWORK;
3378 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP), "NetworkNamespacePath= is not supported, refusing.");
3379 } else
3380 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3381 }
3382
3383 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3384 if (needs_mount_namespace) {
3385 _cleanup_free_ char *error_path = NULL;
3386
3387 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3388 if (r < 0) {
3389 *exit_status = EXIT_NAMESPACE;
3390 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3391 error_path ? ": " : "", strempty(error_path));
3392 }
3393 }
3394
3395 if (context->protect_hostname) {
3396 if (ns_type_supported(NAMESPACE_UTS)) {
3397 if (unshare(CLONE_NEWUTS) < 0) {
3398 *exit_status = EXIT_NAMESPACE;
3399 return log_unit_error_errno(unit, errno, "Failed to set up UTS namespacing: %m");
3400 }
3401 } else
3402 log_unit_warning(unit, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
3403 #if HAVE_SECCOMP
3404 r = seccomp_protect_hostname();
3405 if (r < 0) {
3406 *exit_status = EXIT_SECCOMP;
3407 return log_unit_error_errno(unit, r, "Failed to apply hostname restrictions: %m");
3408 }
3409 #endif
3410 }
3411
3412 /* Drop groups as early as possbile */
3413 if (needs_setuid) {
3414 r = enforce_groups(gid, supplementary_gids, ngids);
3415 if (r < 0) {
3416 *exit_status = EXIT_GROUP;
3417 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3418 }
3419 }
3420
3421 if (needs_sandboxing) {
3422 #if HAVE_SELINUX
3423 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3424 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3425 if (r < 0) {
3426 *exit_status = EXIT_SELINUX_CONTEXT;
3427 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3428 }
3429 }
3430 #endif
3431
3432 if (context->private_users) {
3433 r = setup_private_users(uid, gid);
3434 if (r < 0) {
3435 *exit_status = EXIT_USER;
3436 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3437 }
3438 }
3439 }
3440
3441 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3442 * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3443 * however if we have it as we want to keep it open until the final execve(). */
3444
3445 if (params->exec_fd >= 0) {
3446 exec_fd = params->exec_fd;
3447
3448 if (exec_fd < 3 + (int) n_fds) {
3449 int moved_fd;
3450
3451 /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3452 * process we are about to execute. */
3453
3454 moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3455 if (moved_fd < 0) {
3456 *exit_status = EXIT_FDS;
3457 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3458 }
3459
3460 safe_close(exec_fd);
3461 exec_fd = moved_fd;
3462 } else {
3463 /* This fd should be FD_CLOEXEC already, but let's make sure. */
3464 r = fd_cloexec(exec_fd, true);
3465 if (r < 0) {
3466 *exit_status = EXIT_FDS;
3467 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3468 }
3469 }
3470
3471 fds_with_exec_fd = newa(int, n_fds + 1);
3472 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3473 fds_with_exec_fd[n_fds] = exec_fd;
3474 n_fds_with_exec_fd = n_fds + 1;
3475 } else {
3476 fds_with_exec_fd = fds;
3477 n_fds_with_exec_fd = n_fds;
3478 }
3479
3480 r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3481 if (r >= 0)
3482 r = shift_fds(fds, n_fds);
3483 if (r >= 0)
3484 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3485 if (r < 0) {
3486 *exit_status = EXIT_FDS;
3487 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3488 }
3489
3490 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3491 * and at the right fd numbers. The are no other fds open, with one exception: the exec_fd if it is defined,
3492 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3493 * came this far. */
3494
3495 secure_bits = context->secure_bits;
3496
3497 if (needs_sandboxing) {
3498 uint64_t bset;
3499
3500 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3501 * requested. (Note this is placed after the general resource limit initialization, see
3502 * above, in order to take precedence.) */
3503 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3504 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3505 *exit_status = EXIT_LIMITS;
3506 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3507 }
3508 }
3509
3510 #if ENABLE_SMACK
3511 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3512 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3513 if (use_smack) {
3514 r = setup_smack(context, command);
3515 if (r < 0) {
3516 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3517 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3518 }
3519 }
3520 #endif
3521
3522 bset = context->capability_bounding_set;
3523 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3524 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3525 * instead of us doing that */
3526 if (needs_ambient_hack)
3527 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3528 (UINT64_C(1) << CAP_SETUID) |
3529 (UINT64_C(1) << CAP_SETGID);
3530
3531 if (!cap_test_all(bset)) {
3532 r = capability_bounding_set_drop(bset, false);
3533 if (r < 0) {
3534 *exit_status = EXIT_CAPABILITIES;
3535 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3536 }
3537 }
3538
3539 /* This is done before enforce_user, but ambient set
3540 * does not survive over setresuid() if keep_caps is not set. */
3541 if (!needs_ambient_hack &&
3542 context->capability_ambient_set != 0) {
3543 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3544 if (r < 0) {
3545 *exit_status = EXIT_CAPABILITIES;
3546 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3547 }
3548 }
3549 }
3550
3551 /* chroot to root directory first, before we lose the ability to chroot */
3552 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
3553 if (r < 0)
3554 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
3555
3556 if (needs_setuid) {
3557 if (uid_is_valid(uid)) {
3558 r = enforce_user(context, uid);
3559 if (r < 0) {
3560 *exit_status = EXIT_USER;
3561 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3562 }
3563
3564 if (!needs_ambient_hack &&
3565 context->capability_ambient_set != 0) {
3566
3567 /* Fix the ambient capabilities after user change. */
3568 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3569 if (r < 0) {
3570 *exit_status = EXIT_CAPABILITIES;
3571 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3572 }
3573
3574 /* If we were asked to change user and ambient capabilities
3575 * were requested, we had to add keep-caps to the securebits
3576 * so that we would maintain the inherited capability set
3577 * through the setresuid(). Make sure that the bit is added
3578 * also to the context secure_bits so that we don't try to
3579 * drop the bit away next. */
3580
3581 secure_bits |= 1<<SECURE_KEEP_CAPS;
3582 }
3583 }
3584 }
3585
3586 /* Apply working directory here, because the working directory might be on NFS and only the user running
3587 * this service might have the correct privilege to change to the working directory */
3588 r = apply_working_directory(context, params, home, exit_status);
3589 if (r < 0)
3590 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3591
3592 if (needs_sandboxing) {
3593 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3594 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3595 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3596 * are restricted. */
3597
3598 #if HAVE_SELINUX
3599 if (use_selinux) {
3600 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3601
3602 if (exec_context) {
3603 r = setexeccon(exec_context);
3604 if (r < 0) {
3605 *exit_status = EXIT_SELINUX_CONTEXT;
3606 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3607 }
3608 }
3609 }
3610 #endif
3611
3612 #if HAVE_APPARMOR
3613 if (use_apparmor && context->apparmor_profile) {
3614 r = aa_change_onexec(context->apparmor_profile);
3615 if (r < 0 && !context->apparmor_profile_ignore) {
3616 *exit_status = EXIT_APPARMOR_PROFILE;
3617 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3618 }
3619 }
3620 #endif
3621
3622 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3623 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3624 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3625 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3626 *exit_status = EXIT_SECUREBITS;
3627 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3628 }
3629
3630 if (context_has_no_new_privileges(context))
3631 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3632 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3633 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3634 }
3635
3636 #if HAVE_SECCOMP
3637 r = apply_address_families(unit, context);
3638 if (r < 0) {
3639 *exit_status = EXIT_ADDRESS_FAMILIES;
3640 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3641 }
3642
3643 r = apply_memory_deny_write_execute(unit, context);
3644 if (r < 0) {
3645 *exit_status = EXIT_SECCOMP;
3646 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3647 }
3648
3649 r = apply_restrict_realtime(unit, context);
3650 if (r < 0) {
3651 *exit_status = EXIT_SECCOMP;
3652 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3653 }
3654
3655 r = apply_restrict_suid_sgid(unit, context);
3656 if (r < 0) {
3657 *exit_status = EXIT_SECCOMP;
3658 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3659 }
3660
3661 r = apply_restrict_namespaces(unit, context);
3662 if (r < 0) {
3663 *exit_status = EXIT_SECCOMP;
3664 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3665 }
3666
3667 r = apply_protect_sysctl(unit, context);
3668 if (r < 0) {
3669 *exit_status = EXIT_SECCOMP;
3670 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3671 }
3672
3673 r = apply_protect_kernel_modules(unit, context);
3674 if (r < 0) {
3675 *exit_status = EXIT_SECCOMP;
3676 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3677 }
3678
3679 r = apply_private_devices(unit, context);
3680 if (r < 0) {
3681 *exit_status = EXIT_SECCOMP;
3682 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3683 }
3684
3685 r = apply_syscall_archs(unit, context);
3686 if (r < 0) {
3687 *exit_status = EXIT_SECCOMP;
3688 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3689 }
3690
3691 r = apply_lock_personality(unit, context);
3692 if (r < 0) {
3693 *exit_status = EXIT_SECCOMP;
3694 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3695 }
3696
3697 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3698 * by the filter as little as possible. */
3699 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3700 if (r < 0) {
3701 *exit_status = EXIT_SECCOMP;
3702 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3703 }
3704 #endif
3705 }
3706
3707 if (!strv_isempty(context->unset_environment)) {
3708 char **ee = NULL;
3709
3710 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3711 if (!ee) {
3712 *exit_status = EXIT_MEMORY;
3713 return log_oom();
3714 }
3715
3716 strv_free_and_replace(accum_env, ee);
3717 }
3718
3719 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3720 replaced_argv = replace_env_argv(command->argv, accum_env);
3721 if (!replaced_argv) {
3722 *exit_status = EXIT_MEMORY;
3723 return log_oom();
3724 }
3725 final_argv = replaced_argv;
3726 } else
3727 final_argv = command->argv;
3728
3729 if (DEBUG_LOGGING) {
3730 _cleanup_free_ char *line;
3731
3732 line = exec_command_line(final_argv);
3733 if (line)
3734 log_struct(LOG_DEBUG,
3735 "EXECUTABLE=%s", command->path,
3736 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3737 LOG_UNIT_ID(unit),
3738 LOG_UNIT_INVOCATION_ID(unit));
3739 }
3740
3741 if (exec_fd >= 0) {
3742 uint8_t hot = 1;
3743
3744 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3745 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3746
3747 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3748 *exit_status = EXIT_EXEC;
3749 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3750 }
3751 }
3752
3753 execve(command->path, final_argv, accum_env);
3754 r = -errno;
3755
3756 if (exec_fd >= 0) {
3757 uint8_t hot = 0;
3758
3759 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3760 * that POLLHUP on it no longer means execve() succeeded. */
3761
3762 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3763 *exit_status = EXIT_EXEC;
3764 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
3765 }
3766 }
3767
3768 if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3769 log_struct_errno(LOG_INFO, r,
3770 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3771 LOG_UNIT_ID(unit),
3772 LOG_UNIT_INVOCATION_ID(unit),
3773 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3774 command->path),
3775 "EXECUTABLE=%s", command->path);
3776 return 0;
3777 }
3778
3779 *exit_status = EXIT_EXEC;
3780 return log_unit_error_errno(unit, r, "Failed to execute command: %m");
3781 }
3782
3783 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
3784 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
3785
3786 int exec_spawn(Unit *unit,
3787 ExecCommand *command,
3788 const ExecContext *context,
3789 const ExecParameters *params,
3790 ExecRuntime *runtime,
3791 DynamicCreds *dcreds,
3792 pid_t *ret) {
3793
3794 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
3795 _cleanup_free_ char *subcgroup_path = NULL;
3796 _cleanup_strv_free_ char **files_env = NULL;
3797 size_t n_storage_fds = 0, n_socket_fds = 0;
3798 _cleanup_free_ char *line = NULL;
3799 pid_t pid;
3800
3801 assert(unit);
3802 assert(command);
3803 assert(context);
3804 assert(ret);
3805 assert(params);
3806 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
3807
3808 if (context->std_input == EXEC_INPUT_SOCKET ||
3809 context->std_output == EXEC_OUTPUT_SOCKET ||
3810 context->std_error == EXEC_OUTPUT_SOCKET) {
3811
3812 if (params->n_socket_fds > 1) {
3813 log_unit_error(unit, "Got more than one socket.");
3814 return -EINVAL;
3815 }
3816
3817 if (params->n_socket_fds == 0) {
3818 log_unit_error(unit, "Got no socket.");
3819 return -EINVAL;
3820 }
3821
3822 socket_fd = params->fds[0];
3823 } else {
3824 socket_fd = -1;
3825 fds = params->fds;
3826 n_socket_fds = params->n_socket_fds;
3827 n_storage_fds = params->n_storage_fds;
3828 }
3829
3830 r = exec_context_named_iofds(context, params, named_iofds);
3831 if (r < 0)
3832 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3833
3834 r = exec_context_load_environment(unit, context, &files_env);
3835 if (r < 0)
3836 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3837
3838 line = exec_command_line(command->argv);
3839 if (!line)
3840 return log_oom();
3841
3842 log_struct(LOG_DEBUG,
3843 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3844 "EXECUTABLE=%s", command->path,
3845 LOG_UNIT_ID(unit),
3846 LOG_UNIT_INVOCATION_ID(unit));
3847
3848 if (params->cgroup_path) {
3849 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
3850 if (r < 0)
3851 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
3852 if (r > 0) { /* We are using a child cgroup */
3853 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
3854 if (r < 0)
3855 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
3856 }
3857 }
3858
3859 pid = fork();
3860 if (pid < 0)
3861 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3862
3863 if (pid == 0) {
3864 int exit_status = EXIT_SUCCESS;
3865
3866 r = exec_child(unit,
3867 command,
3868 context,
3869 params,
3870 runtime,
3871 dcreds,
3872 socket_fd,
3873 named_iofds,
3874 fds,
3875 n_socket_fds,
3876 n_storage_fds,
3877 files_env,
3878 unit->manager->user_lookup_fds[1],
3879 &exit_status);
3880
3881 if (r < 0) {
3882 const char *status =
3883 exit_status_to_string(exit_status,
3884 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
3885
3886 log_struct_errno(LOG_ERR, r,
3887 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3888 LOG_UNIT_ID(unit),
3889 LOG_UNIT_INVOCATION_ID(unit),
3890 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3891 status, command->path),
3892 "EXECUTABLE=%s", command->path);
3893 }
3894
3895 _exit(exit_status);
3896 }
3897
3898 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3899
3900 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
3901 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
3902 * process will be killed too). */
3903 if (subcgroup_path)
3904 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
3905
3906 exec_status_start(&command->exec_status, pid);
3907
3908 *ret = pid;
3909 return 0;
3910 }
3911
3912 void exec_context_init(ExecContext *c) {
3913 ExecDirectoryType i;
3914
3915 assert(c);
3916
3917 c->umask = 0022;
3918 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3919 c->cpu_sched_policy = SCHED_OTHER;
3920 c->syslog_priority = LOG_DAEMON|LOG_INFO;
3921 c->syslog_level_prefix = true;
3922 c->ignore_sigpipe = true;
3923 c->timer_slack_nsec = NSEC_INFINITY;
3924 c->personality = PERSONALITY_INVALID;
3925 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3926 c->directories[i].mode = 0755;
3927 c->capability_bounding_set = CAP_ALL;
3928 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
3929 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
3930 c->log_level_max = -1;
3931 numa_policy_reset(&c->numa_policy);
3932 }
3933
3934 void exec_context_done(ExecContext *c) {
3935 ExecDirectoryType i;
3936 size_t l;
3937
3938 assert(c);
3939
3940 c->environment = strv_free(c->environment);
3941 c->environment_files = strv_free(c->environment_files);
3942 c->pass_environment = strv_free(c->pass_environment);
3943 c->unset_environment = strv_free(c->unset_environment);
3944
3945 rlimit_free_all(c->rlimit);
3946
3947 for (l = 0; l < 3; l++) {
3948 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3949 c->stdio_file[l] = mfree(c->stdio_file[l]);
3950 }
3951
3952 c->working_directory = mfree(c->working_directory);
3953 c->root_directory = mfree(c->root_directory);
3954 c->root_image = mfree(c->root_image);
3955 c->tty_path = mfree(c->tty_path);
3956 c->syslog_identifier = mfree(c->syslog_identifier);
3957 c->user = mfree(c->user);
3958 c->group = mfree(c->group);
3959
3960 c->supplementary_groups = strv_free(c->supplementary_groups);
3961
3962 c->pam_name = mfree(c->pam_name);
3963
3964 c->read_only_paths = strv_free(c->read_only_paths);
3965 c->read_write_paths = strv_free(c->read_write_paths);
3966 c->inaccessible_paths = strv_free(c->inaccessible_paths);
3967
3968 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3969 c->bind_mounts = NULL;
3970 c->n_bind_mounts = 0;
3971 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3972 c->temporary_filesystems = NULL;
3973 c->n_temporary_filesystems = 0;
3974
3975 cpu_set_reset(&c->cpu_set);
3976 numa_policy_reset(&c->numa_policy);
3977
3978 c->utmp_id = mfree(c->utmp_id);
3979 c->selinux_context = mfree(c->selinux_context);
3980 c->apparmor_profile = mfree(c->apparmor_profile);
3981 c->smack_process_label = mfree(c->smack_process_label);
3982
3983 c->syscall_filter = hashmap_free(c->syscall_filter);
3984 c->syscall_archs = set_free(c->syscall_archs);
3985 c->address_families = set_free(c->address_families);
3986
3987 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3988 c->directories[i].paths = strv_free(c->directories[i].paths);
3989
3990 c->log_level_max = -1;
3991
3992 exec_context_free_log_extra_fields(c);
3993
3994 c->log_rate_limit_interval_usec = 0;
3995 c->log_rate_limit_burst = 0;
3996
3997 c->stdin_data = mfree(c->stdin_data);
3998 c->stdin_data_size = 0;
3999
4000 c->network_namespace_path = mfree(c->network_namespace_path);
4001 }
4002
4003 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4004 char **i;
4005
4006 assert(c);
4007
4008 if (!runtime_prefix)
4009 return 0;
4010
4011 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4012 _cleanup_free_ char *p;
4013
4014 p = path_join(runtime_prefix, *i);
4015 if (!p)
4016 return -ENOMEM;
4017
4018 /* We execute this synchronously, since we need to be sure this is gone when we start the
4019 * service next. */
4020 (void) rm_rf(p, REMOVE_ROOT);
4021 }
4022
4023 return 0;
4024 }
4025
4026 static void exec_command_done(ExecCommand *c) {
4027 assert(c);
4028
4029 c->path = mfree(c->path);
4030 c->argv = strv_free(c->argv);
4031 }
4032
4033 void exec_command_done_array(ExecCommand *c, size_t n) {
4034 size_t i;
4035
4036 for (i = 0; i < n; i++)
4037 exec_command_done(c+i);
4038 }
4039
4040 ExecCommand* exec_command_free_list(ExecCommand *c) {
4041 ExecCommand *i;
4042
4043 while ((i = c)) {
4044 LIST_REMOVE(command, c, i);
4045 exec_command_done(i);
4046 free(i);
4047 }
4048
4049 return NULL;
4050 }
4051
4052 void exec_command_free_array(ExecCommand **c, size_t n) {
4053 size_t i;
4054
4055 for (i = 0; i < n; i++)
4056 c[i] = exec_command_free_list(c[i]);
4057 }
4058
4059 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4060 size_t i;
4061
4062 for (i = 0; i < n; i++)
4063 exec_status_reset(&c[i].exec_status);
4064 }
4065
4066 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4067 size_t i;
4068
4069 for (i = 0; i < n; i++) {
4070 ExecCommand *z;
4071
4072 LIST_FOREACH(command, z, c[i])
4073 exec_status_reset(&z->exec_status);
4074 }
4075 }
4076
4077 typedef struct InvalidEnvInfo {
4078 const Unit *unit;
4079 const char *path;
4080 } InvalidEnvInfo;
4081
4082 static void invalid_env(const char *p, void *userdata) {
4083 InvalidEnvInfo *info = userdata;
4084
4085 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4086 }
4087
4088 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4089 assert(c);
4090
4091 switch (fd_index) {
4092
4093 case STDIN_FILENO:
4094 if (c->std_input != EXEC_INPUT_NAMED_FD)
4095 return NULL;
4096
4097 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4098
4099 case STDOUT_FILENO:
4100 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4101 return NULL;
4102
4103 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4104
4105 case STDERR_FILENO:
4106 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4107 return NULL;
4108
4109 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4110
4111 default:
4112 return NULL;
4113 }
4114 }
4115
4116 static int exec_context_named_iofds(
4117 const ExecContext *c,
4118 const ExecParameters *p,
4119 int named_iofds[static 3]) {
4120
4121 size_t i, targets;
4122 const char* stdio_fdname[3];
4123 size_t n_fds;
4124
4125 assert(c);
4126 assert(p);
4127 assert(named_iofds);
4128
4129 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4130 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4131 (c->std_error == EXEC_OUTPUT_NAMED_FD);
4132
4133 for (i = 0; i < 3; i++)
4134 stdio_fdname[i] = exec_context_fdname(c, i);
4135
4136 n_fds = p->n_storage_fds + p->n_socket_fds;
4137
4138 for (i = 0; i < n_fds && targets > 0; i++)
4139 if (named_iofds[STDIN_FILENO] < 0 &&
4140 c->std_input == EXEC_INPUT_NAMED_FD &&
4141 stdio_fdname[STDIN_FILENO] &&
4142 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4143
4144 named_iofds[STDIN_FILENO] = p->fds[i];
4145 targets--;
4146
4147 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4148 c->std_output == EXEC_OUTPUT_NAMED_FD &&
4149 stdio_fdname[STDOUT_FILENO] &&
4150 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4151
4152 named_iofds[STDOUT_FILENO] = p->fds[i];
4153 targets--;
4154
4155 } else if (named_iofds[STDERR_FILENO] < 0 &&
4156 c->std_error == EXEC_OUTPUT_NAMED_FD &&
4157 stdio_fdname[STDERR_FILENO] &&
4158 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4159
4160 named_iofds[STDERR_FILENO] = p->fds[i];
4161 targets--;
4162 }
4163
4164 return targets == 0 ? 0 : -ENOENT;
4165 }
4166
4167 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
4168 char **i, **r = NULL;
4169
4170 assert(c);
4171 assert(l);
4172
4173 STRV_FOREACH(i, c->environment_files) {
4174 char *fn;
4175 int k;
4176 unsigned n;
4177 bool ignore = false;
4178 char **p;
4179 _cleanup_globfree_ glob_t pglob = {};
4180
4181 fn = *i;
4182
4183 if (fn[0] == '-') {
4184 ignore = true;
4185 fn++;
4186 }
4187
4188 if (!path_is_absolute(fn)) {
4189 if (ignore)
4190 continue;
4191
4192 strv_free(r);
4193 return -EINVAL;
4194 }
4195
4196 /* Filename supports globbing, take all matching files */
4197 k = safe_glob(fn, 0, &pglob);
4198 if (k < 0) {
4199 if (ignore)
4200 continue;
4201
4202 strv_free(r);
4203 return k;
4204 }
4205
4206 /* When we don't match anything, -ENOENT should be returned */
4207 assert(pglob.gl_pathc > 0);
4208
4209 for (n = 0; n < pglob.gl_pathc; n++) {
4210 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4211 if (k < 0) {
4212 if (ignore)
4213 continue;
4214
4215 strv_free(r);
4216 return k;
4217 }
4218 /* Log invalid environment variables with filename */
4219 if (p) {
4220 InvalidEnvInfo info = {
4221 .unit = unit,
4222 .path = pglob.gl_pathv[n]
4223 };
4224
4225 p = strv_env_clean_with_callback(p, invalid_env, &info);
4226 }
4227
4228 if (!r)
4229 r = p;
4230 else {
4231 char **m;
4232
4233 m = strv_env_merge(2, r, p);
4234 strv_free(r);
4235 strv_free(p);
4236 if (!m)
4237 return -ENOMEM;
4238
4239 r = m;
4240 }
4241 }
4242 }
4243
4244 *l = r;
4245
4246 return 0;
4247 }
4248
4249 static bool tty_may_match_dev_console(const char *tty) {
4250 _cleanup_free_ char *resolved = NULL;
4251
4252 if (!tty)
4253 return true;
4254
4255 tty = skip_dev_prefix(tty);
4256
4257 /* trivial identity? */
4258 if (streq(tty, "console"))
4259 return true;
4260
4261 if (resolve_dev_console(&resolved) < 0)
4262 return true; /* if we could not resolve, assume it may */
4263
4264 /* "tty0" means the active VC, so it may be the same sometimes */
4265 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4266 }
4267
4268 static bool exec_context_may_touch_tty(const ExecContext *ec) {
4269 assert(ec);
4270
4271 return ec->tty_reset ||
4272 ec->tty_vhangup ||
4273 ec->tty_vt_disallocate ||
4274 is_terminal_input(ec->std_input) ||
4275 is_terminal_output(ec->std_output) ||
4276 is_terminal_output(ec->std_error);
4277 }
4278
4279 bool exec_context_may_touch_console(const ExecContext *ec) {
4280
4281 return exec_context_may_touch_tty(ec) &&
4282 tty_may_match_dev_console(exec_context_tty_path(ec));
4283 }
4284
4285 static void strv_fprintf(FILE *f, char **l) {
4286 char **g;
4287
4288 assert(f);
4289
4290 STRV_FOREACH(g, l)
4291 fprintf(f, " %s", *g);
4292 }
4293
4294 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4295 ExecDirectoryType dt;
4296 char **e, **d;
4297 unsigned i;
4298 int r;
4299
4300 assert(c);
4301 assert(f);
4302
4303 prefix = strempty(prefix);
4304
4305 fprintf(f,
4306 "%sUMask: %04o\n"
4307 "%sWorkingDirectory: %s\n"
4308 "%sRootDirectory: %s\n"
4309 "%sNonBlocking: %s\n"
4310 "%sPrivateTmp: %s\n"
4311 "%sPrivateDevices: %s\n"
4312 "%sProtectKernelTunables: %s\n"
4313 "%sProtectKernelModules: %s\n"
4314 "%sProtectControlGroups: %s\n"
4315 "%sPrivateNetwork: %s\n"
4316 "%sPrivateUsers: %s\n"
4317 "%sProtectHome: %s\n"
4318 "%sProtectSystem: %s\n"
4319 "%sMountAPIVFS: %s\n"
4320 "%sIgnoreSIGPIPE: %s\n"
4321 "%sMemoryDenyWriteExecute: %s\n"
4322 "%sRestrictRealtime: %s\n"
4323 "%sRestrictSUIDSGID: %s\n"
4324 "%sKeyringMode: %s\n"
4325 "%sProtectHostname: %s\n",
4326 prefix, c->umask,
4327 prefix, c->working_directory ? c->working_directory : "/",
4328 prefix, c->root_directory ? c->root_directory : "/",
4329 prefix, yes_no(c->non_blocking),
4330 prefix, yes_no(c->private_tmp),
4331 prefix, yes_no(c->private_devices),
4332 prefix, yes_no(c->protect_kernel_tunables),
4333 prefix, yes_no(c->protect_kernel_modules),
4334 prefix, yes_no(c->protect_control_groups),
4335 prefix, yes_no(c->private_network),
4336 prefix, yes_no(c->private_users),
4337 prefix, protect_home_to_string(c->protect_home),
4338 prefix, protect_system_to_string(c->protect_system),
4339 prefix, yes_no(c->mount_apivfs),
4340 prefix, yes_no(c->ignore_sigpipe),
4341 prefix, yes_no(c->memory_deny_write_execute),
4342 prefix, yes_no(c->restrict_realtime),
4343 prefix, yes_no(c->restrict_suid_sgid),
4344 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4345 prefix, yes_no(c->protect_hostname));
4346
4347 if (c->root_image)
4348 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4349
4350 STRV_FOREACH(e, c->environment)
4351 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4352
4353 STRV_FOREACH(e, c->environment_files)
4354 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4355
4356 STRV_FOREACH(e, c->pass_environment)
4357 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4358
4359 STRV_FOREACH(e, c->unset_environment)
4360 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4361
4362 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4363
4364 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4365 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4366
4367 STRV_FOREACH(d, c->directories[dt].paths)
4368 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4369 }
4370
4371 if (c->nice_set)
4372 fprintf(f,
4373 "%sNice: %i\n",
4374 prefix, c->nice);
4375
4376 if (c->oom_score_adjust_set)
4377 fprintf(f,
4378 "%sOOMScoreAdjust: %i\n",
4379 prefix, c->oom_score_adjust);
4380
4381 for (i = 0; i < RLIM_NLIMITS; i++)
4382 if (c->rlimit[i]) {
4383 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4384 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4385 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4386 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4387 }
4388
4389 if (c->ioprio_set) {
4390 _cleanup_free_ char *class_str = NULL;
4391
4392 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4393 if (r >= 0)
4394 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4395
4396 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4397 }
4398
4399 if (c->cpu_sched_set) {
4400 _cleanup_free_ char *policy_str = NULL;
4401
4402 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4403 if (r >= 0)
4404 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4405
4406 fprintf(f,
4407 "%sCPUSchedulingPriority: %i\n"
4408 "%sCPUSchedulingResetOnFork: %s\n",
4409 prefix, c->cpu_sched_priority,
4410 prefix, yes_no(c->cpu_sched_reset_on_fork));
4411 }
4412
4413 if (c->cpu_set.set) {
4414 _cleanup_free_ char *affinity = NULL;
4415
4416 affinity = cpu_set_to_range_string(&c->cpu_set);
4417 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
4418 }
4419
4420 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
4421 _cleanup_free_ char *nodes = NULL;
4422
4423 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
4424 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
4425 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
4426 }
4427
4428 if (c->timer_slack_nsec != NSEC_INFINITY)
4429 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4430
4431 fprintf(f,
4432 "%sStandardInput: %s\n"
4433 "%sStandardOutput: %s\n"
4434 "%sStandardError: %s\n",
4435 prefix, exec_input_to_string(c->std_input),
4436 prefix, exec_output_to_string(c->std_output),
4437 prefix, exec_output_to_string(c->std_error));
4438
4439 if (c->std_input == EXEC_INPUT_NAMED_FD)
4440 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4441 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4442 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4443 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4444 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4445
4446 if (c->std_input == EXEC_INPUT_FILE)
4447 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4448 if (c->std_output == EXEC_OUTPUT_FILE)
4449 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4450 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4451 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4452 if (c->std_error == EXEC_OUTPUT_FILE)
4453 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4454 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4455 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4456
4457 if (c->tty_path)
4458 fprintf(f,
4459 "%sTTYPath: %s\n"
4460 "%sTTYReset: %s\n"
4461 "%sTTYVHangup: %s\n"
4462 "%sTTYVTDisallocate: %s\n",
4463 prefix, c->tty_path,
4464 prefix, yes_no(c->tty_reset),
4465 prefix, yes_no(c->tty_vhangup),
4466 prefix, yes_no(c->tty_vt_disallocate));
4467
4468 if (IN_SET(c->std_output,
4469 EXEC_OUTPUT_SYSLOG,
4470 EXEC_OUTPUT_KMSG,
4471 EXEC_OUTPUT_JOURNAL,
4472 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4473 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4474 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4475 IN_SET(c->std_error,
4476 EXEC_OUTPUT_SYSLOG,
4477 EXEC_OUTPUT_KMSG,
4478 EXEC_OUTPUT_JOURNAL,
4479 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4480 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4481 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4482
4483 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4484
4485 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4486 if (r >= 0)
4487 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4488
4489 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4490 if (r >= 0)
4491 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4492 }
4493
4494 if (c->log_level_max >= 0) {
4495 _cleanup_free_ char *t = NULL;
4496
4497 (void) log_level_to_string_alloc(c->log_level_max, &t);
4498
4499 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4500 }
4501
4502 if (c->log_rate_limit_interval_usec > 0) {
4503 char buf_timespan[FORMAT_TIMESPAN_MAX];
4504
4505 fprintf(f,
4506 "%sLogRateLimitIntervalSec: %s\n",
4507 prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_rate_limit_interval_usec, USEC_PER_SEC));
4508 }
4509
4510 if (c->log_rate_limit_burst > 0)
4511 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_rate_limit_burst);
4512
4513 if (c->n_log_extra_fields > 0) {
4514 size_t j;
4515
4516 for (j = 0; j < c->n_log_extra_fields; j++) {
4517 fprintf(f, "%sLogExtraFields: ", prefix);
4518 fwrite(c->log_extra_fields[j].iov_base,
4519 1, c->log_extra_fields[j].iov_len,
4520 f);
4521 fputc('\n', f);
4522 }
4523 }
4524
4525 if (c->secure_bits) {
4526 _cleanup_free_ char *str = NULL;
4527
4528 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4529 if (r >= 0)
4530 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4531 }
4532
4533 if (c->capability_bounding_set != CAP_ALL) {
4534 _cleanup_free_ char *str = NULL;
4535
4536 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4537 if (r >= 0)
4538 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4539 }
4540
4541 if (c->capability_ambient_set != 0) {
4542 _cleanup_free_ char *str = NULL;
4543
4544 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4545 if (r >= 0)
4546 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4547 }
4548
4549 if (c->user)
4550 fprintf(f, "%sUser: %s\n", prefix, c->user);
4551 if (c->group)
4552 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4553
4554 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4555
4556 if (!strv_isempty(c->supplementary_groups)) {
4557 fprintf(f, "%sSupplementaryGroups:", prefix);
4558 strv_fprintf(f, c->supplementary_groups);
4559 fputs("\n", f);
4560 }
4561
4562 if (c->pam_name)
4563 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4564
4565 if (!strv_isempty(c->read_write_paths)) {
4566 fprintf(f, "%sReadWritePaths:", prefix);
4567 strv_fprintf(f, c->read_write_paths);
4568 fputs("\n", f);
4569 }
4570
4571 if (!strv_isempty(c->read_only_paths)) {
4572 fprintf(f, "%sReadOnlyPaths:", prefix);
4573 strv_fprintf(f, c->read_only_paths);
4574 fputs("\n", f);
4575 }
4576
4577 if (!strv_isempty(c->inaccessible_paths)) {
4578 fprintf(f, "%sInaccessiblePaths:", prefix);
4579 strv_fprintf(f, c->inaccessible_paths);
4580 fputs("\n", f);
4581 }
4582
4583 if (c->n_bind_mounts > 0)
4584 for (i = 0; i < c->n_bind_mounts; i++)
4585 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4586 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4587 c->bind_mounts[i].ignore_enoent ? "-": "",
4588 c->bind_mounts[i].source,
4589 c->bind_mounts[i].destination,
4590 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4591
4592 if (c->n_temporary_filesystems > 0)
4593 for (i = 0; i < c->n_temporary_filesystems; i++) {
4594 TemporaryFileSystem *t = c->temporary_filesystems + i;
4595
4596 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4597 t->path,
4598 isempty(t->options) ? "" : ":",
4599 strempty(t->options));
4600 }
4601
4602 if (c->utmp_id)
4603 fprintf(f,
4604 "%sUtmpIdentifier: %s\n",
4605 prefix, c->utmp_id);
4606
4607 if (c->selinux_context)
4608 fprintf(f,
4609 "%sSELinuxContext: %s%s\n",
4610 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4611
4612 if (c->apparmor_profile)
4613 fprintf(f,
4614 "%sAppArmorProfile: %s%s\n",
4615 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4616
4617 if (c->smack_process_label)
4618 fprintf(f,
4619 "%sSmackProcessLabel: %s%s\n",
4620 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4621
4622 if (c->personality != PERSONALITY_INVALID)
4623 fprintf(f,
4624 "%sPersonality: %s\n",
4625 prefix, strna(personality_to_string(c->personality)));
4626
4627 fprintf(f,
4628 "%sLockPersonality: %s\n",
4629 prefix, yes_no(c->lock_personality));
4630
4631 if (c->syscall_filter) {
4632 #if HAVE_SECCOMP
4633 Iterator j;
4634 void *id, *val;
4635 bool first = true;
4636 #endif
4637
4638 fprintf(f,
4639 "%sSystemCallFilter: ",
4640 prefix);
4641
4642 if (!c->syscall_whitelist)
4643 fputc('~', f);
4644
4645 #if HAVE_SECCOMP
4646 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4647 _cleanup_free_ char *name = NULL;
4648 const char *errno_name = NULL;
4649 int num = PTR_TO_INT(val);
4650
4651 if (first)
4652 first = false;
4653 else
4654 fputc(' ', f);
4655
4656 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4657 fputs(strna(name), f);
4658
4659 if (num >= 0) {
4660 errno_name = errno_to_name(num);
4661 if (errno_name)
4662 fprintf(f, ":%s", errno_name);
4663 else
4664 fprintf(f, ":%d", num);
4665 }
4666 }
4667 #endif
4668
4669 fputc('\n', f);
4670 }
4671
4672 if (c->syscall_archs) {
4673 #if HAVE_SECCOMP
4674 Iterator j;
4675 void *id;
4676 #endif
4677
4678 fprintf(f,
4679 "%sSystemCallArchitectures:",
4680 prefix);
4681
4682 #if HAVE_SECCOMP
4683 SET_FOREACH(id, c->syscall_archs, j)
4684 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4685 #endif
4686 fputc('\n', f);
4687 }
4688
4689 if (exec_context_restrict_namespaces_set(c)) {
4690 _cleanup_free_ char *s = NULL;
4691
4692 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4693 if (r >= 0)
4694 fprintf(f, "%sRestrictNamespaces: %s\n",
4695 prefix, s);
4696 }
4697
4698 if (c->network_namespace_path)
4699 fprintf(f,
4700 "%sNetworkNamespacePath: %s\n",
4701 prefix, c->network_namespace_path);
4702
4703 if (c->syscall_errno > 0) {
4704 const char *errno_name;
4705
4706 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4707
4708 errno_name = errno_to_name(c->syscall_errno);
4709 if (errno_name)
4710 fprintf(f, "%s\n", errno_name);
4711 else
4712 fprintf(f, "%d\n", c->syscall_errno);
4713 }
4714 }
4715
4716 bool exec_context_maintains_privileges(const ExecContext *c) {
4717 assert(c);
4718
4719 /* Returns true if the process forked off would run under
4720 * an unchanged UID or as root. */
4721
4722 if (!c->user)
4723 return true;
4724
4725 if (streq(c->user, "root") || streq(c->user, "0"))
4726 return true;
4727
4728 return false;
4729 }
4730
4731 int exec_context_get_effective_ioprio(const ExecContext *c) {
4732 int p;
4733
4734 assert(c);
4735
4736 if (c->ioprio_set)
4737 return c->ioprio;
4738
4739 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4740 if (p < 0)
4741 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4742
4743 return p;
4744 }
4745
4746 void exec_context_free_log_extra_fields(ExecContext *c) {
4747 size_t l;
4748
4749 assert(c);
4750
4751 for (l = 0; l < c->n_log_extra_fields; l++)
4752 free(c->log_extra_fields[l].iov_base);
4753 c->log_extra_fields = mfree(c->log_extra_fields);
4754 c->n_log_extra_fields = 0;
4755 }
4756
4757 void exec_context_revert_tty(ExecContext *c) {
4758 int r;
4759
4760 assert(c);
4761
4762 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
4763 exec_context_tty_reset(c, NULL);
4764
4765 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
4766 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
4767 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
4768
4769 if (exec_context_may_touch_tty(c)) {
4770 const char *path;
4771
4772 path = exec_context_tty_path(c);
4773 if (path) {
4774 r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
4775 if (r < 0 && r != -ENOENT)
4776 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
4777 }
4778 }
4779 }
4780
4781 int exec_context_get_clean_directories(
4782 ExecContext *c,
4783 char **prefix,
4784 ExecCleanMask mask,
4785 char ***ret) {
4786
4787 _cleanup_strv_free_ char **l = NULL;
4788 ExecDirectoryType t;
4789 int r;
4790
4791 assert(c);
4792 assert(prefix);
4793 assert(ret);
4794
4795 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4796 char **i;
4797
4798 if (!FLAGS_SET(mask, 1U << t))
4799 continue;
4800
4801 if (!prefix[t])
4802 continue;
4803
4804 STRV_FOREACH(i, c->directories[t].paths) {
4805 char *j;
4806
4807 j = path_join(prefix[t], *i);
4808 if (!j)
4809 return -ENOMEM;
4810
4811 r = strv_consume(&l, j);
4812 if (r < 0)
4813 return r;
4814 }
4815 }
4816
4817 *ret = TAKE_PTR(l);
4818 return 0;
4819 }
4820
4821 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
4822 ExecCleanMask mask = 0;
4823
4824 assert(c);
4825 assert(ret);
4826
4827 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4828 if (!strv_isempty(c->directories[t].paths))
4829 mask |= 1U << t;
4830
4831 *ret = mask;
4832 return 0;
4833 }
4834
4835 void exec_status_start(ExecStatus *s, pid_t pid) {
4836 assert(s);
4837
4838 *s = (ExecStatus) {
4839 .pid = pid,
4840 };
4841
4842 dual_timestamp_get(&s->start_timestamp);
4843 }
4844
4845 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4846 assert(s);
4847
4848 if (s->pid != pid) {
4849 *s = (ExecStatus) {
4850 .pid = pid,
4851 };
4852 }
4853
4854 dual_timestamp_get(&s->exit_timestamp);
4855
4856 s->code = code;
4857 s->status = status;
4858
4859 if (context && context->utmp_id)
4860 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
4861 }
4862
4863 void exec_status_reset(ExecStatus *s) {
4864 assert(s);
4865
4866 *s = (ExecStatus) {};
4867 }
4868
4869 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4870 char buf[FORMAT_TIMESTAMP_MAX];
4871
4872 assert(s);
4873 assert(f);
4874
4875 if (s->pid <= 0)
4876 return;
4877
4878 prefix = strempty(prefix);
4879
4880 fprintf(f,
4881 "%sPID: "PID_FMT"\n",
4882 prefix, s->pid);
4883
4884 if (dual_timestamp_is_set(&s->start_timestamp))
4885 fprintf(f,
4886 "%sStart Timestamp: %s\n",
4887 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4888
4889 if (dual_timestamp_is_set(&s->exit_timestamp))
4890 fprintf(f,
4891 "%sExit Timestamp: %s\n"
4892 "%sExit Code: %s\n"
4893 "%sExit Status: %i\n",
4894 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4895 prefix, sigchld_code_to_string(s->code),
4896 prefix, s->status);
4897 }
4898
4899 static char *exec_command_line(char **argv) {
4900 size_t k;
4901 char *n, *p, **a;
4902 bool first = true;
4903
4904 assert(argv);
4905
4906 k = 1;
4907 STRV_FOREACH(a, argv)
4908 k += strlen(*a)+3;
4909
4910 n = new(char, k);
4911 if (!n)
4912 return NULL;
4913
4914 p = n;
4915 STRV_FOREACH(a, argv) {
4916
4917 if (!first)
4918 *(p++) = ' ';
4919 else
4920 first = false;
4921
4922 if (strpbrk(*a, WHITESPACE)) {
4923 *(p++) = '\'';
4924 p = stpcpy(p, *a);
4925 *(p++) = '\'';
4926 } else
4927 p = stpcpy(p, *a);
4928
4929 }
4930
4931 *p = 0;
4932
4933 /* FIXME: this doesn't really handle arguments that have
4934 * spaces and ticks in them */
4935
4936 return n;
4937 }
4938
4939 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4940 _cleanup_free_ char *cmd = NULL;
4941 const char *prefix2;
4942
4943 assert(c);
4944 assert(f);
4945
4946 prefix = strempty(prefix);
4947 prefix2 = strjoina(prefix, "\t");
4948
4949 cmd = exec_command_line(c->argv);
4950 fprintf(f,
4951 "%sCommand Line: %s\n",
4952 prefix, cmd ? cmd : strerror_safe(ENOMEM));
4953
4954 exec_status_dump(&c->exec_status, f, prefix2);
4955 }
4956
4957 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4958 assert(f);
4959
4960 prefix = strempty(prefix);
4961
4962 LIST_FOREACH(command, c, c)
4963 exec_command_dump(c, f, prefix);
4964 }
4965
4966 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4967 ExecCommand *end;
4968
4969 assert(l);
4970 assert(e);
4971
4972 if (*l) {
4973 /* It's kind of important, that we keep the order here */
4974 LIST_FIND_TAIL(command, *l, end);
4975 LIST_INSERT_AFTER(command, *l, end, e);
4976 } else
4977 *l = e;
4978 }
4979
4980 int exec_command_set(ExecCommand *c, const char *path, ...) {
4981 va_list ap;
4982 char **l, *p;
4983
4984 assert(c);
4985 assert(path);
4986
4987 va_start(ap, path);
4988 l = strv_new_ap(path, ap);
4989 va_end(ap);
4990
4991 if (!l)
4992 return -ENOMEM;
4993
4994 p = strdup(path);
4995 if (!p) {
4996 strv_free(l);
4997 return -ENOMEM;
4998 }
4999
5000 free_and_replace(c->path, p);
5001
5002 return strv_free_and_replace(c->argv, l);
5003 }
5004
5005 int exec_command_append(ExecCommand *c, const char *path, ...) {
5006 _cleanup_strv_free_ char **l = NULL;
5007 va_list ap;
5008 int r;
5009
5010 assert(c);
5011 assert(path);
5012
5013 va_start(ap, path);
5014 l = strv_new_ap(path, ap);
5015 va_end(ap);
5016
5017 if (!l)
5018 return -ENOMEM;
5019
5020 r = strv_extend_strv(&c->argv, l, false);
5021 if (r < 0)
5022 return r;
5023
5024 return 0;
5025 }
5026
5027 static void *remove_tmpdir_thread(void *p) {
5028 _cleanup_free_ char *path = p;
5029
5030 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5031 return NULL;
5032 }
5033
5034 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5035 int r;
5036
5037 if (!rt)
5038 return NULL;
5039
5040 if (rt->manager)
5041 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
5042
5043 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5044 if (destroy && rt->tmp_dir) {
5045 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
5046
5047 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
5048 if (r < 0) {
5049 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
5050 free(rt->tmp_dir);
5051 }
5052
5053 rt->tmp_dir = NULL;
5054 }
5055
5056 if (destroy && rt->var_tmp_dir) {
5057 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
5058
5059 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
5060 if (r < 0) {
5061 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
5062 free(rt->var_tmp_dir);
5063 }
5064
5065 rt->var_tmp_dir = NULL;
5066 }
5067
5068 rt->id = mfree(rt->id);
5069 rt->tmp_dir = mfree(rt->tmp_dir);
5070 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
5071 safe_close_pair(rt->netns_storage_socket);
5072 return mfree(rt);
5073 }
5074
5075 static void exec_runtime_freep(ExecRuntime **rt) {
5076 (void) exec_runtime_free(*rt, false);
5077 }
5078
5079 static int exec_runtime_allocate(ExecRuntime **ret) {
5080 ExecRuntime *n;
5081
5082 assert(ret);
5083
5084 n = new(ExecRuntime, 1);
5085 if (!n)
5086 return -ENOMEM;
5087
5088 *n = (ExecRuntime) {
5089 .netns_storage_socket = { -1, -1 },
5090 };
5091
5092 *ret = n;
5093 return 0;
5094 }
5095
5096 static int exec_runtime_add(
5097 Manager *m,
5098 const char *id,
5099 const char *tmp_dir,
5100 const char *var_tmp_dir,
5101 const int netns_storage_socket[2],
5102 ExecRuntime **ret) {
5103
5104 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
5105 int r;
5106
5107 assert(m);
5108 assert(id);
5109
5110 r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
5111 if (r < 0)
5112 return r;
5113
5114 r = exec_runtime_allocate(&rt);
5115 if (r < 0)
5116 return r;
5117
5118 rt->id = strdup(id);
5119 if (!rt->id)
5120 return -ENOMEM;
5121
5122 if (tmp_dir) {
5123 rt->tmp_dir = strdup(tmp_dir);
5124 if (!rt->tmp_dir)
5125 return -ENOMEM;
5126
5127 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
5128 assert(var_tmp_dir);
5129 rt->var_tmp_dir = strdup(var_tmp_dir);
5130 if (!rt->var_tmp_dir)
5131 return -ENOMEM;
5132 }
5133
5134 if (netns_storage_socket) {
5135 rt->netns_storage_socket[0] = netns_storage_socket[0];
5136 rt->netns_storage_socket[1] = netns_storage_socket[1];
5137 }
5138
5139 r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
5140 if (r < 0)
5141 return r;
5142
5143 rt->manager = m;
5144
5145 if (ret)
5146 *ret = rt;
5147
5148 /* do not remove created ExecRuntime object when the operation succeeds. */
5149 rt = NULL;
5150 return 0;
5151 }
5152
5153 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5154 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
5155 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
5156 int r;
5157
5158 assert(m);
5159 assert(c);
5160 assert(id);
5161
5162 /* It is not necessary to create ExecRuntime object. */
5163 if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
5164 return 0;
5165
5166 if (c->private_tmp) {
5167 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
5168 if (r < 0)
5169 return r;
5170 }
5171
5172 if (c->private_network || c->network_namespace_path) {
5173 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5174 return -errno;
5175 }
5176
5177 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
5178 if (r < 0)
5179 return r;
5180
5181 /* Avoid cleanup */
5182 netns_storage_socket[0] = netns_storage_socket[1] = -1;
5183 return 1;
5184 }
5185
5186 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5187 ExecRuntime *rt;
5188 int r;
5189
5190 assert(m);
5191 assert(id);
5192 assert(ret);
5193
5194 rt = hashmap_get(m->exec_runtime_by_id, id);
5195 if (rt)
5196 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5197 goto ref;
5198
5199 if (!create)
5200 return 0;
5201
5202 /* If not found, then create a new object. */
5203 r = exec_runtime_make(m, c, id, &rt);
5204 if (r <= 0)
5205 /* When r == 0, it is not necessary to create ExecRuntime object. */
5206 return r;
5207
5208 ref:
5209 /* increment reference counter. */
5210 rt->n_ref++;
5211 *ret = rt;
5212 return 1;
5213 }
5214
5215 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5216 if (!rt)
5217 return NULL;
5218
5219 assert(rt->n_ref > 0);
5220
5221 rt->n_ref--;
5222 if (rt->n_ref > 0)
5223 return NULL;
5224
5225 return exec_runtime_free(rt, destroy);
5226 }
5227
5228 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5229 ExecRuntime *rt;
5230 Iterator i;
5231
5232 assert(m);
5233 assert(f);
5234 assert(fds);
5235
5236 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5237 fprintf(f, "exec-runtime=%s", rt->id);
5238
5239 if (rt->tmp_dir)
5240 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
5241
5242 if (rt->var_tmp_dir)
5243 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
5244
5245 if (rt->netns_storage_socket[0] >= 0) {
5246 int copy;
5247
5248 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5249 if (copy < 0)
5250 return copy;
5251
5252 fprintf(f, " netns-socket-0=%i", copy);
5253 }
5254
5255 if (rt->netns_storage_socket[1] >= 0) {
5256 int copy;
5257
5258 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5259 if (copy < 0)
5260 return copy;
5261
5262 fprintf(f, " netns-socket-1=%i", copy);
5263 }
5264
5265 fputc('\n', f);
5266 }
5267
5268 return 0;
5269 }
5270
5271 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5272 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5273 ExecRuntime *rt;
5274 int r;
5275
5276 /* This is for the migration from old (v237 or earlier) deserialization text.
5277 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5278 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5279 * so or not from the serialized text, then we always creates a new object owned by this. */
5280
5281 assert(u);
5282 assert(key);
5283 assert(value);
5284
5285 /* Manager manages ExecRuntime objects by the unit id.
5286 * So, we omit the serialized text when the unit does not have id (yet?)... */
5287 if (isempty(u->id)) {
5288 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5289 return 0;
5290 }
5291
5292 r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5293 if (r < 0) {
5294 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5295 return 0;
5296 }
5297
5298 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5299 if (!rt) {
5300 r = exec_runtime_allocate(&rt_create);
5301 if (r < 0)
5302 return log_oom();
5303
5304 rt_create->id = strdup(u->id);
5305 if (!rt_create->id)
5306 return log_oom();
5307
5308 rt = rt_create;
5309 }
5310
5311 if (streq(key, "tmp-dir")) {
5312 char *copy;
5313
5314 copy = strdup(value);
5315 if (!copy)
5316 return log_oom();
5317
5318 free_and_replace(rt->tmp_dir, copy);
5319
5320 } else if (streq(key, "var-tmp-dir")) {
5321 char *copy;
5322
5323 copy = strdup(value);
5324 if (!copy)
5325 return log_oom();
5326
5327 free_and_replace(rt->var_tmp_dir, copy);
5328
5329 } else if (streq(key, "netns-socket-0")) {
5330 int fd;
5331
5332 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5333 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5334 return 0;
5335 }
5336
5337 safe_close(rt->netns_storage_socket[0]);
5338 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5339
5340 } else if (streq(key, "netns-socket-1")) {
5341 int fd;
5342
5343 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5344 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5345 return 0;
5346 }
5347
5348 safe_close(rt->netns_storage_socket[1]);
5349 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5350 } else
5351 return 0;
5352
5353 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5354 if (rt_create) {
5355 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5356 if (r < 0) {
5357 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5358 return 0;
5359 }
5360
5361 rt_create->manager = u->manager;
5362
5363 /* Avoid cleanup */
5364 rt_create = NULL;
5365 }
5366
5367 return 1;
5368 }
5369
5370 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5371 char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5372 int r, fd0 = -1, fd1 = -1;
5373 const char *p, *v = value;
5374 size_t n;
5375
5376 assert(m);
5377 assert(value);
5378 assert(fds);
5379
5380 n = strcspn(v, " ");
5381 id = strndupa(v, n);
5382 if (v[n] != ' ')
5383 goto finalize;
5384 p = v + n + 1;
5385
5386 v = startswith(p, "tmp-dir=");
5387 if (v) {
5388 n = strcspn(v, " ");
5389 tmp_dir = strndupa(v, n);
5390 if (v[n] != ' ')
5391 goto finalize;
5392 p = v + n + 1;
5393 }
5394
5395 v = startswith(p, "var-tmp-dir=");
5396 if (v) {
5397 n = strcspn(v, " ");
5398 var_tmp_dir = strndupa(v, n);
5399 if (v[n] != ' ')
5400 goto finalize;
5401 p = v + n + 1;
5402 }
5403
5404 v = startswith(p, "netns-socket-0=");
5405 if (v) {
5406 char *buf;
5407
5408 n = strcspn(v, " ");
5409 buf = strndupa(v, n);
5410 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5411 log_debug("Unable to process exec-runtime netns fd specification.");
5412 return;
5413 }
5414 fd0 = fdset_remove(fds, fd0);
5415 if (v[n] != ' ')
5416 goto finalize;
5417 p = v + n + 1;
5418 }
5419
5420 v = startswith(p, "netns-socket-1=");
5421 if (v) {
5422 char *buf;
5423
5424 n = strcspn(v, " ");
5425 buf = strndupa(v, n);
5426 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5427 log_debug("Unable to process exec-runtime netns fd specification.");
5428 return;
5429 }
5430 fd1 = fdset_remove(fds, fd1);
5431 }
5432
5433 finalize:
5434
5435 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5436 if (r < 0)
5437 log_debug_errno(r, "Failed to add exec-runtime: %m");
5438 }
5439
5440 void exec_runtime_vacuum(Manager *m) {
5441 ExecRuntime *rt;
5442 Iterator i;
5443
5444 assert(m);
5445
5446 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5447
5448 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5449 if (rt->n_ref > 0)
5450 continue;
5451
5452 (void) exec_runtime_free(rt, false);
5453 }
5454 }
5455
5456 void exec_params_clear(ExecParameters *p) {
5457 if (!p)
5458 return;
5459
5460 strv_free(p->environment);
5461 }
5462
/* String names of the ExecInput values, indexed by enum value. The lookup functions for the table
 * (such as exec_input_to_string()) are presumably generated by the macro below it — confirm against
 * the definition of DEFINE_STRING_TABLE_LOOKUP. */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL] = "null",
        [EXEC_INPUT_TTY] = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
        [EXEC_INPUT_SOCKET] = "socket",
        [EXEC_INPUT_NAMED_FD] = "fd",
        [EXEC_INPUT_DATA] = "data",
        [EXEC_INPUT_FILE] = "file",
};

DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5475
5476 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5477 [EXEC_OUTPUT_INHERIT] = "inherit",
5478 [EXEC_OUTPUT_NULL] = "null",
5479 [EXEC_OUTPUT_TTY] = "tty",
5480 [EXEC_OUTPUT_SYSLOG] = "syslog",
5481 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
5482 [EXEC_OUTPUT_KMSG] = "kmsg",
5483 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5484 [EXEC_OUTPUT_JOURNAL] = "journal",
5485 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5486 [EXEC_OUTPUT_SOCKET] = "socket",
5487 [EXEC_OUTPUT_NAMED_FD] = "fd",
5488 [EXEC_OUTPUT_FILE] = "file",
5489 [EXEC_OUTPUT_FILE_APPEND] = "append",
5490 };
5491
5492 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5493
5494 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5495 [EXEC_UTMP_INIT] = "init",
5496 [EXEC_UTMP_LOGIN] = "login",
5497 [EXEC_UTMP_USER] = "user",
5498 };
5499
5500 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5501
5502 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5503 [EXEC_PRESERVE_NO] = "no",
5504 [EXEC_PRESERVE_YES] = "yes",
5505 [EXEC_PRESERVE_RESTART] = "restart",
5506 };
5507
5508 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5509
5510 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
5511 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5512 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5513 [EXEC_DIRECTORY_STATE] = "StateDirectory",
5514 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5515 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5516 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5517 };
5518
5519 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5520
5521 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
5522 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
5523 * directories, specifically .timer units with their timestamp touch file. */
5524 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5525 [EXEC_DIRECTORY_RUNTIME] = "runtime",
5526 [EXEC_DIRECTORY_STATE] = "state",
5527 [EXEC_DIRECTORY_CACHE] = "cache",
5528 [EXEC_DIRECTORY_LOGS] = "logs",
5529 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
5530 };
5531
5532 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
5533
5534 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
5535 * the service payload in. */
5536 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5537 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
5538 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
5539 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
5540 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
5541 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
5542 };
5543
5544 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5545
5546 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5547 [EXEC_KEYRING_INHERIT] = "inherit",
5548 [EXEC_KEYRING_PRIVATE] = "private",
5549 [EXEC_KEYRING_SHARED] = "shared",
5550 };
5551
5552 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);