]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
core: also remove private directories by systemctl clean
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <glob.h>
6 #include <grp.h>
7 #include <poll.h>
8 #include <signal.h>
9 #include <string.h>
10 #include <sys/capability.h>
11 #include <sys/eventfd.h>
12 #include <sys/mman.h>
13 #include <sys/personality.h>
14 #include <sys/prctl.h>
15 #include <sys/shm.h>
16 #include <sys/socket.h>
17 #include <sys/stat.h>
18 #include <sys/types.h>
19 #include <sys/un.h>
20 #include <unistd.h>
21 #include <utmpx.h>
22
23 #if HAVE_PAM
24 #include <security/pam_appl.h>
25 #endif
26
27 #if HAVE_SELINUX
28 #include <selinux/selinux.h>
29 #endif
30
31 #if HAVE_SECCOMP
32 #include <seccomp.h>
33 #endif
34
35 #if HAVE_APPARMOR
36 #include <sys/apparmor.h>
37 #endif
38
39 #include "sd-messages.h"
40
41 #include "af-list.h"
42 #include "alloc-util.h"
43 #if HAVE_APPARMOR
44 #include "apparmor-util.h"
45 #endif
46 #include "async.h"
47 #include "barrier.h"
48 #include "cap-list.h"
49 #include "capability-util.h"
50 #include "chown-recursive.h"
51 #include "cpu-set-util.h"
52 #include "def.h"
53 #include "env-file.h"
54 #include "env-util.h"
55 #include "errno-list.h"
56 #include "execute.h"
57 #include "exit-status.h"
58 #include "fd-util.h"
59 #include "format-util.h"
60 #include "fs-util.h"
61 #include "glob-util.h"
62 #include "io-util.h"
63 #include "ioprio.h"
64 #include "label.h"
65 #include "log.h"
66 #include "macro.h"
67 #include "manager.h"
68 #include "memory-util.h"
69 #include "missing.h"
70 #include "mkdir.h"
71 #include "namespace.h"
72 #include "parse-util.h"
73 #include "path-util.h"
74 #include "process-util.h"
75 #include "rlimit-util.h"
76 #include "rm-rf.h"
77 #if HAVE_SECCOMP
78 #include "seccomp-util.h"
79 #endif
80 #include "securebits-util.h"
81 #include "selinux-util.h"
82 #include "signal-util.h"
83 #include "smack-util.h"
84 #include "socket-util.h"
85 #include "special.h"
86 #include "stat-util.h"
87 #include "string-table.h"
88 #include "string-util.h"
89 #include "strv.h"
90 #include "syslog-util.h"
91 #include "terminal-util.h"
92 #include "umask-util.h"
93 #include "unit.h"
94 #include "user-util.h"
95 #include "utmp-wtmp.h"
96
97 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
98 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
99
100 #define SNDBUF_SIZE (8*1024*1024)
101
/* Moves the passed file descriptors so that fds[i] ends up at fd number i + 3, i.e. packed directly behind
 * stdin/stdout/stderr. The array is updated in place with the new fd numbers.
 *
 * Returns 0 on success, or a negative errno-style error if duplicating an fd fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds <= 0)
                return 0;

        /* Careful: the caller's array is rewritten by this function. */

        assert(fds);

        for (;;) {
                int retry_at = -1, idx;

                for (idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, dup_fd;

                        /* This one already sits in its final slot */
                        if (fds[idx] == wanted)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* If the slot we asked for was still occupied we got a different fd back. Remember the
                         * first index where that happened, so that another pass can start from there. */
                        if (dup_fd != wanted && retry_at < 0)
                                retry_at = idx;
                }

                if (retry_at < 0)
                        return 0;

                first = retry_at;
        }
}
146
/* Adjusts the file descriptor flags of the fds to pass to a child: the first n_socket_fds entries get
 * O_NONBLOCK set or cleared according to 'nonblock', and FD_CLOEXEC is cleared on every entry (socket and
 * storage fds alike).
 *
 * Returns 0 on success, or the negative error of the first helper call that failed. */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds, k;
        int r;

        if (total <= 0)
                return 0;

        assert(fds);

        for (k = 0; k < total; k++) {
                bool is_socket = k < n_socket_fds;

                /* O_NONBLOCK is only relevant for socket activation fds */
                if (is_socket) {
                        r = fd_nonblock(fds[k], nonblock);
                        if (r < 0)
                                return r;
                }

                /* These fds are meant to be inherited by our children, hence FD_CLOEXEC is always turned
                 * off. */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
179
180 static const char *exec_context_tty_path(const ExecContext *context) {
181 assert(context);
182
183 if (context->stdio_as_fds)
184 return NULL;
185
186 if (context->tty_path)
187 return context->tty_path;
188
189 return "/dev/console";
190 }
191
192 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
193 const char *path;
194
195 assert(context);
196
197 path = exec_context_tty_path(context);
198
199 if (context->tty_vhangup) {
200 if (p && p->stdin_fd >= 0)
201 (void) terminal_vhangup_fd(p->stdin_fd);
202 else if (path)
203 (void) terminal_vhangup(path);
204 }
205
206 if (context->tty_reset) {
207 if (p && p->stdin_fd >= 0)
208 (void) reset_terminal_fd(p->stdin_fd, true);
209 else if (path)
210 (void) reset_terminal(path);
211 }
212
213 if (context->tty_vt_disallocate && path)
214 (void) vt_disallocate(path);
215 }
216
217 static bool is_terminal_input(ExecInput i) {
218 return IN_SET(i,
219 EXEC_INPUT_TTY,
220 EXEC_INPUT_TTY_FORCE,
221 EXEC_INPUT_TTY_FAIL);
222 }
223
224 static bool is_terminal_output(ExecOutput o) {
225 return IN_SET(o,
226 EXEC_OUTPUT_TTY,
227 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
228 EXEC_OUTPUT_KMSG_AND_CONSOLE,
229 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
230 }
231
232 static bool is_syslog_output(ExecOutput o) {
233 return IN_SET(o,
234 EXEC_OUTPUT_SYSLOG,
235 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
236 }
237
238 static bool is_kmsg_output(ExecOutput o) {
239 return IN_SET(o,
240 EXEC_OUTPUT_KMSG,
241 EXEC_OUTPUT_KMSG_AND_CONSOLE);
242 }
243
244 static bool exec_context_needs_term(const ExecContext *c) {
245 assert(c);
246
247 /* Return true if the execution context suggests we should set $TERM to something useful. */
248
249 if (is_terminal_input(c->std_input))
250 return true;
251
252 if (is_terminal_output(c->std_output))
253 return true;
254
255 if (is_terminal_output(c->std_error))
256 return true;
257
258 return !!c->tty_path;
259 }
260
/* Opens /dev/null with the given access flags (plus O_NOCTTY) and moves the resulting fd to fd number 'nfd'.
 * Returns the result of move_fd(), or a negative errno if opening fails. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        return move_fd(null_fd, nfd, false);
}
272
273 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
274 static const union sockaddr_union sa = {
275 .un.sun_family = AF_UNIX,
276 .un.sun_path = "/run/systemd/journal/stdout",
277 };
278 uid_t olduid = UID_INVALID;
279 gid_t oldgid = GID_INVALID;
280 int r;
281
282 if (gid_is_valid(gid)) {
283 oldgid = getgid();
284
285 if (setegid(gid) < 0)
286 return -errno;
287 }
288
289 if (uid_is_valid(uid)) {
290 olduid = getuid();
291
292 if (seteuid(uid) < 0) {
293 r = -errno;
294 goto restore_gid;
295 }
296 }
297
298 r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
299
300 /* If we fail to restore the uid or gid, things will likely
301 fail later on. This should only happen if an LSM interferes. */
302
303 if (uid_is_valid(uid))
304 (void) seteuid(olduid);
305
306 restore_gid:
307 if (gid_is_valid(gid))
308 (void) setegid(oldgid);
309
310 return r;
311 }
312
313 static int connect_logger_as(
314 const Unit *unit,
315 const ExecContext *context,
316 const ExecParameters *params,
317 ExecOutput output,
318 const char *ident,
319 int nfd,
320 uid_t uid,
321 gid_t gid) {
322
323 _cleanup_close_ int fd = -1;
324 int r;
325
326 assert(context);
327 assert(params);
328 assert(output < _EXEC_OUTPUT_MAX);
329 assert(ident);
330 assert(nfd >= 0);
331
332 fd = socket(AF_UNIX, SOCK_STREAM, 0);
333 if (fd < 0)
334 return -errno;
335
336 r = connect_journal_socket(fd, uid, gid);
337 if (r < 0)
338 return r;
339
340 if (shutdown(fd, SHUT_RD) < 0)
341 return -errno;
342
343 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
344
345 if (dprintf(fd,
346 "%s\n"
347 "%s\n"
348 "%i\n"
349 "%i\n"
350 "%i\n"
351 "%i\n"
352 "%i\n",
353 context->syslog_identifier ?: ident,
354 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
355 context->syslog_priority,
356 !!context->syslog_level_prefix,
357 is_syslog_output(output),
358 is_kmsg_output(output),
359 is_terminal_output(output)) < 0)
360 return -errno;
361
362 return move_fd(TAKE_FD(fd), nfd, false);
363 }
364
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves the resulting fd to fd number
 * 'nfd'. Returns the result of move_fd(), or the negative error from open_terminal(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);
        if (tty_fd < 0)
                return tty_fd;

        return move_fd(tty_fd, nfd, false);
}
377
378 static int acquire_path(const char *path, int flags, mode_t mode) {
379 union sockaddr_union sa = {};
380 _cleanup_close_ int fd = -1;
381 int r, salen;
382
383 assert(path);
384
385 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
386 flags |= O_CREAT;
387
388 fd = open(path, flags|O_NOCTTY, mode);
389 if (fd >= 0)
390 return TAKE_FD(fd);
391
392 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
393 return -errno;
394 if (strlen(path) >= sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
395 return -ENXIO;
396
397 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
398
399 fd = socket(AF_UNIX, SOCK_STREAM, 0);
400 if (fd < 0)
401 return -errno;
402
403 salen = sockaddr_un_set_path(&sa.un, path);
404 if (salen < 0)
405 return salen;
406
407 if (connect(fd, &sa.sa, salen) < 0)
408 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
409 * indication that his wasn't an AF_UNIX socket after all */
410
411 if ((flags & O_ACCMODE) == O_RDONLY)
412 r = shutdown(fd, SHUT_WR);
413 else if ((flags & O_ACCMODE) == O_WRONLY)
414 r = shutdown(fd, SHUT_RD);
415 else
416 return TAKE_FD(fd);
417 if (r < 0)
418 return -errno;
419
420 return TAKE_FD(fd);
421 }
422
423 static int fixup_input(
424 const ExecContext *context,
425 int socket_fd,
426 bool apply_tty_stdin) {
427
428 ExecInput std_input;
429
430 assert(context);
431
432 std_input = context->std_input;
433
434 if (is_terminal_input(std_input) && !apply_tty_stdin)
435 return EXEC_INPUT_NULL;
436
437 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
438 return EXEC_INPUT_NULL;
439
440 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
441 return EXEC_INPUT_NULL;
442
443 return std_input;
444 }
445
446 static int fixup_output(ExecOutput std_output, int socket_fd) {
447
448 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
449 return EXEC_OUTPUT_INHERIT;
450
451 return std_output;
452 }
453
454 static int setup_input(
455 const ExecContext *context,
456 const ExecParameters *params,
457 int socket_fd,
458 const int named_iofds[static 3]) {
459
460 ExecInput i;
461
462 assert(context);
463 assert(params);
464 assert(named_iofds);
465
466 if (params->stdin_fd >= 0) {
467 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
468 return -errno;
469
470 /* Try to make this the controlling tty, if it is a tty, and reset it */
471 if (isatty(STDIN_FILENO)) {
472 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
473 (void) reset_terminal_fd(STDIN_FILENO, true);
474 }
475
476 return STDIN_FILENO;
477 }
478
479 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
480
481 switch (i) {
482
483 case EXEC_INPUT_NULL:
484 return open_null_as(O_RDONLY, STDIN_FILENO);
485
486 case EXEC_INPUT_TTY:
487 case EXEC_INPUT_TTY_FORCE:
488 case EXEC_INPUT_TTY_FAIL: {
489 int fd;
490
491 fd = acquire_terminal(exec_context_tty_path(context),
492 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
493 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
494 ACQUIRE_TERMINAL_WAIT,
495 USEC_INFINITY);
496 if (fd < 0)
497 return fd;
498
499 return move_fd(fd, STDIN_FILENO, false);
500 }
501
502 case EXEC_INPUT_SOCKET:
503 assert(socket_fd >= 0);
504
505 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
506
507 case EXEC_INPUT_NAMED_FD:
508 assert(named_iofds[STDIN_FILENO] >= 0);
509
510 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
511 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
512
513 case EXEC_INPUT_DATA: {
514 int fd;
515
516 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
517 if (fd < 0)
518 return fd;
519
520 return move_fd(fd, STDIN_FILENO, false);
521 }
522
523 case EXEC_INPUT_FILE: {
524 bool rw;
525 int fd;
526
527 assert(context->stdio_file[STDIN_FILENO]);
528
529 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
530 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
531
532 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
533 if (fd < 0)
534 return fd;
535
536 return move_fd(fd, STDIN_FILENO, false);
537 }
538
539 default:
540 assert_not_reached("Unknown input type");
541 }
542 }
543
544 static bool can_inherit_stderr_from_stdout(
545 const ExecContext *context,
546 ExecOutput o,
547 ExecOutput e) {
548
549 assert(context);
550
551 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
552 * stderr fd */
553
554 if (e == EXEC_OUTPUT_INHERIT)
555 return true;
556 if (e != o)
557 return false;
558
559 if (e == EXEC_OUTPUT_NAMED_FD)
560 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
561
562 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
563 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
564
565 return true;
566 }
567
568 static int setup_output(
569 const Unit *unit,
570 const ExecContext *context,
571 const ExecParameters *params,
572 int fileno,
573 int socket_fd,
574 const int named_iofds[static 3],
575 const char *ident,
576 uid_t uid,
577 gid_t gid,
578 dev_t *journal_stream_dev,
579 ino_t *journal_stream_ino) {
580
581 ExecOutput o;
582 ExecInput i;
583 int r;
584
585 assert(unit);
586 assert(context);
587 assert(params);
588 assert(ident);
589 assert(journal_stream_dev);
590 assert(journal_stream_ino);
591
592 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
593
594 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
595 return -errno;
596
597 return STDOUT_FILENO;
598 }
599
600 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
601 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
602 return -errno;
603
604 return STDERR_FILENO;
605 }
606
607 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
608 o = fixup_output(context->std_output, socket_fd);
609
610 if (fileno == STDERR_FILENO) {
611 ExecOutput e;
612 e = fixup_output(context->std_error, socket_fd);
613
614 /* This expects the input and output are already set up */
615
616 /* Don't change the stderr file descriptor if we inherit all
617 * the way and are not on a tty */
618 if (e == EXEC_OUTPUT_INHERIT &&
619 o == EXEC_OUTPUT_INHERIT &&
620 i == EXEC_INPUT_NULL &&
621 !is_terminal_input(context->std_input) &&
622 getppid () != 1)
623 return fileno;
624
625 /* Duplicate from stdout if possible */
626 if (can_inherit_stderr_from_stdout(context, o, e))
627 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
628
629 o = e;
630
631 } else if (o == EXEC_OUTPUT_INHERIT) {
632 /* If input got downgraded, inherit the original value */
633 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
634 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
635
636 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
637 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
638 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
639
640 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
641 if (getppid() != 1)
642 return fileno;
643
644 /* We need to open /dev/null here anew, to get the right access mode. */
645 return open_null_as(O_WRONLY, fileno);
646 }
647
648 switch (o) {
649
650 case EXEC_OUTPUT_NULL:
651 return open_null_as(O_WRONLY, fileno);
652
653 case EXEC_OUTPUT_TTY:
654 if (is_terminal_input(i))
655 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
656
657 /* We don't reset the terminal if this is just about output */
658 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
659
660 case EXEC_OUTPUT_SYSLOG:
661 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
662 case EXEC_OUTPUT_KMSG:
663 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
664 case EXEC_OUTPUT_JOURNAL:
665 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
666 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
667 if (r < 0) {
668 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
669 r = open_null_as(O_WRONLY, fileno);
670 } else {
671 struct stat st;
672
673 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
674 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
675 * services to detect whether they are connected to the journal or not.
676 *
677 * If both stdout and stderr are connected to a stream then let's make sure to store the data
678 * about STDERR as that's usually the best way to do logging. */
679
680 if (fstat(fileno, &st) >= 0 &&
681 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
682 *journal_stream_dev = st.st_dev;
683 *journal_stream_ino = st.st_ino;
684 }
685 }
686 return r;
687
688 case EXEC_OUTPUT_SOCKET:
689 assert(socket_fd >= 0);
690
691 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
692
693 case EXEC_OUTPUT_NAMED_FD:
694 assert(named_iofds[fileno] >= 0);
695
696 (void) fd_nonblock(named_iofds[fileno], false);
697 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
698
699 case EXEC_OUTPUT_FILE:
700 case EXEC_OUTPUT_FILE_APPEND: {
701 bool rw;
702 int fd, flags;
703
704 assert(context->stdio_file[fileno]);
705
706 rw = context->std_input == EXEC_INPUT_FILE &&
707 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
708
709 if (rw)
710 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
711
712 flags = O_WRONLY;
713 if (o == EXEC_OUTPUT_FILE_APPEND)
714 flags |= O_APPEND;
715
716 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
717 if (fd < 0)
718 return fd;
719
720 return move_fd(fd, fileno, 0);
721 }
722
723 default:
724 assert_not_reached("Unknown error type");
725 }
726 }
727
728 static int chown_terminal(int fd, uid_t uid) {
729 int r;
730
731 assert(fd >= 0);
732
733 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
734 if (isatty(fd) < 1) {
735 if (IN_SET(errno, EINVAL, ENOTTY))
736 return 0; /* not a tty */
737
738 return -errno;
739 }
740
741 /* This might fail. What matters are the results. */
742 r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
743 if (r < 0)
744 return r;
745
746 return 1;
747 }
748
749 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
750 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
751 int r;
752
753 assert(_saved_stdin);
754 assert(_saved_stdout);
755
756 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
757 if (saved_stdin < 0)
758 return -errno;
759
760 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
761 if (saved_stdout < 0)
762 return -errno;
763
764 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
765 if (fd < 0)
766 return fd;
767
768 r = chown_terminal(fd, getuid());
769 if (r < 0)
770 return r;
771
772 r = reset_terminal_fd(fd, true);
773 if (r < 0)
774 return r;
775
776 r = rearrange_stdio(fd, fd, STDERR_FILENO);
777 fd = -1;
778 if (r < 0)
779 return r;
780
781 *_saved_stdin = saved_stdin;
782 *_saved_stdout = saved_stdout;
783
784 saved_stdin = saved_stdout = -1;
785
786 return 0;
787 }
788
789 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
790 assert(err < 0);
791
792 if (err == -ETIMEDOUT)
793 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
794 else {
795 errno = -err;
796 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
797 }
798 }
799
800 static void write_confirm_error(int err, const char *vc, const Unit *u) {
801 _cleanup_close_ int fd = -1;
802
803 assert(vc);
804
805 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
806 if (fd < 0)
807 return;
808
809 write_confirm_error_fd(err, fd, u);
810 }
811
/* Releases the terminal and puts the saved stdin/stdout fds back in place. The saved fds are closed and
 * invalidated in any case.
 *
 * Returns 0 on success, or the negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
833
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* 'f': don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS = 0,  /* 's': don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE = 1,          /* 'y': execute the command; also used when asking is not possible */
};
839
840 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
841 int saved_stdout = -1, saved_stdin = -1, r;
842 _cleanup_free_ char *e = NULL;
843 char c;
844
845 /* For any internal errors, assume a positive response. */
846 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
847 if (r < 0) {
848 write_confirm_error(r, vc, u);
849 return CONFIRM_EXECUTE;
850 }
851
852 /* confirm_spawn might have been disabled while we were sleeping. */
853 if (manager_is_confirm_spawn_disabled(u->manager)) {
854 r = 1;
855 goto restore_stdio;
856 }
857
858 e = ellipsize(cmdline, 60, 100);
859 if (!e) {
860 log_oom();
861 r = CONFIRM_EXECUTE;
862 goto restore_stdio;
863 }
864
865 for (;;) {
866 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
867 if (r < 0) {
868 write_confirm_error_fd(r, STDOUT_FILENO, u);
869 r = CONFIRM_EXECUTE;
870 goto restore_stdio;
871 }
872
873 switch (c) {
874 case 'c':
875 printf("Resuming normal execution.\n");
876 manager_disable_confirm_spawn();
877 r = 1;
878 break;
879 case 'D':
880 unit_dump(u, stdout, " ");
881 continue; /* ask again */
882 case 'f':
883 printf("Failing execution.\n");
884 r = CONFIRM_PRETEND_FAILURE;
885 break;
886 case 'h':
887 printf(" c - continue, proceed without asking anymore\n"
888 " D - dump, show the state of the unit\n"
889 " f - fail, don't execute the command and pretend it failed\n"
890 " h - help\n"
891 " i - info, show a short summary of the unit\n"
892 " j - jobs, show jobs that are in progress\n"
893 " s - skip, don't execute the command and pretend it succeeded\n"
894 " y - yes, execute the command\n");
895 continue; /* ask again */
896 case 'i':
897 printf(" Description: %s\n"
898 " Unit: %s\n"
899 " Command: %s\n",
900 u->id, u->description, cmdline);
901 continue; /* ask again */
902 case 'j':
903 manager_dump_jobs(u->manager, stdout, " ");
904 continue; /* ask again */
905 case 'n':
906 /* 'n' was removed in favor of 'f'. */
907 printf("Didn't understand 'n', did you mean 'f'?\n");
908 continue; /* ask again */
909 case 's':
910 printf("Skipping execution.\n");
911 r = CONFIRM_PRETEND_SUCCESS;
912 break;
913 case 'y':
914 r = CONFIRM_EXECUTE;
915 break;
916 default:
917 assert_not_reached("Unhandled choice");
918 }
919 break;
920 }
921
922 restore_stdio:
923 restore_confirm_stdio(&saved_stdin, &saved_stdout);
924 return r;
925 }
926
927 static int get_fixed_user(const ExecContext *c, const char **user,
928 uid_t *uid, gid_t *gid,
929 const char **home, const char **shell) {
930 int r;
931 const char *name;
932
933 assert(c);
934
935 if (!c->user)
936 return 0;
937
938 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
939 * (i.e. are "/" or "/bin/nologin"). */
940
941 name = c->user;
942 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
943 if (r < 0)
944 return r;
945
946 *user = name;
947 return 0;
948 }
949
950 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
951 int r;
952 const char *name;
953
954 assert(c);
955
956 if (!c->group)
957 return 0;
958
959 name = c->group;
960 r = get_group_creds(&name, gid, 0);
961 if (r < 0)
962 return r;
963
964 *group = name;
965 return 0;
966 }
967
968 static int get_supplementary_groups(const ExecContext *c, const char *user,
969 const char *group, gid_t gid,
970 gid_t **supplementary_gids, int *ngids) {
971 char **i;
972 int r, k = 0;
973 int ngroups_max;
974 bool keep_groups = false;
975 gid_t *groups = NULL;
976 _cleanup_free_ gid_t *l_gids = NULL;
977
978 assert(c);
979
980 /*
981 * If user is given, then lookup GID and supplementary groups list.
982 * We avoid NSS lookups for gid=0. Also we have to initialize groups
983 * here and as early as possible so we keep the list of supplementary
984 * groups of the caller.
985 */
986 if (user && gid_is_valid(gid) && gid != 0) {
987 /* First step, initialize groups from /etc/groups */
988 if (initgroups(user, gid) < 0)
989 return -errno;
990
991 keep_groups = true;
992 }
993
994 if (strv_isempty(c->supplementary_groups))
995 return 0;
996
997 /*
998 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
999 * be positive, otherwise fail.
1000 */
1001 errno = 0;
1002 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1003 if (ngroups_max <= 0)
1004 return errno_or_else(EOPNOTSUPP);
1005
1006 l_gids = new(gid_t, ngroups_max);
1007 if (!l_gids)
1008 return -ENOMEM;
1009
1010 if (keep_groups) {
1011 /*
1012 * Lookup the list of groups that the user belongs to, we
1013 * avoid NSS lookups here too for gid=0.
1014 */
1015 k = ngroups_max;
1016 if (getgrouplist(user, gid, l_gids, &k) < 0)
1017 return -EINVAL;
1018 } else
1019 k = 0;
1020
1021 STRV_FOREACH(i, c->supplementary_groups) {
1022 const char *g;
1023
1024 if (k >= ngroups_max)
1025 return -E2BIG;
1026
1027 g = *i;
1028 r = get_group_creds(&g, l_gids+k, 0);
1029 if (r < 0)
1030 return r;
1031
1032 k++;
1033 }
1034
1035 /*
1036 * Sets ngids to zero to drop all supplementary groups, happens
1037 * when we are under root and SupplementaryGroups= is empty.
1038 */
1039 if (k == 0) {
1040 *ngids = 0;
1041 return 0;
1042 }
1043
1044 /* Otherwise get the final list of supplementary groups */
1045 groups = memdup(l_gids, sizeof(gid_t) * k);
1046 if (!groups)
1047 return -ENOMEM;
1048
1049 *supplementary_gids = groups;
1050 *ngids = k;
1051
1052 groups = NULL;
1053
1054 return 0;
1055 }
1056
/* Applies the group credentials: installs the supplementary groups (if any were passed), then sets the
 * real, effective and saved GID to 'gid' (if valid).
 *
 * Returns 0 on success, negative errno-style error on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        int r;

        /* Only touch the supplementary groups if the list is not empty */
        if (ngids > 0) {
                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then switch all our GIDs */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1075
1076 static int enforce_user(const ExecContext *context, uid_t uid) {
1077 assert(context);
1078
1079 if (!uid_is_valid(uid))
1080 return 0;
1081
1082 /* Sets (but doesn't look up) the uid and make sure we keep the
1083 * capabilities while doing so. */
1084
1085 if (context->capability_ambient_set != 0) {
1086
1087 /* First step: If we need to keep capabilities but
1088 * drop privileges we need to make sure we keep our
1089 * caps, while we drop privileges. */
1090 if (uid != 0) {
1091 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1092
1093 if (prctl(PR_GET_SECUREBITS) != sb)
1094 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1095 return -errno;
1096 }
1097 }
1098
1099 /* Second step: actually set the uids */
1100 if (setresuid(uid, uid, uid) < 0)
1101 return -errno;
1102
1103 /* At this point we should have all necessary capabilities but
1104 are otherwise a normal user. However, the caps might got
1105 corrupted due to the setresuid() so we need clean them up
1106 later. This is done outside of this call. */
1107
1108 return 0;
1109 }
1110
1111 #if HAVE_PAM
1112
1113 static int null_conv(
1114 int num_msg,
1115 const struct pam_message **msg,
1116 struct pam_response **resp,
1117 void *appdata_ptr) {
1118
1119 /* We don't support conversations */
1120
1121 return PAM_CONV_ERR;
1122 }
1123
1124 #endif
1125
1126 static int setup_pam(
1127 const char *name,
1128 const char *user,
1129 uid_t uid,
1130 gid_t gid,
1131 const char *tty,
1132 char ***env,
1133 int fds[], size_t n_fds) {
1134
1135 #if HAVE_PAM
1136
1137 static const struct pam_conv conv = {
1138 .conv = null_conv,
1139 .appdata_ptr = NULL
1140 };
1141
1142 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1143 pam_handle_t *handle = NULL;
1144 sigset_t old_ss;
1145 int pam_code = PAM_SUCCESS, r;
1146 char **nv, **e = NULL;
1147 bool close_session = false;
1148 pid_t pam_pid = 0, parent_pid;
1149 int flags = 0;
1150
1151 assert(name);
1152 assert(user);
1153 assert(env);
1154
1155 /* We set up PAM in the parent process, then fork. The child
1156 * will then stay around until killed via PR_GET_PDEATHSIG or
1157 * systemd via the cgroup logic. It will then remove the PAM
1158 * session again. The parent process will exec() the actual
1159 * daemon. We do things this way to ensure that the main PID
1160 * of the daemon is the one we initially fork()ed. */
1161
1162 r = barrier_create(&barrier);
1163 if (r < 0)
1164 goto fail;
1165
1166 if (log_get_max_level() < LOG_DEBUG)
1167 flags |= PAM_SILENT;
1168
1169 pam_code = pam_start(name, user, &conv, &handle);
1170 if (pam_code != PAM_SUCCESS) {
1171 handle = NULL;
1172 goto fail;
1173 }
1174
1175 if (!tty) {
1176 _cleanup_free_ char *q = NULL;
1177
1178 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1179 * out if that's the case, and read the TTY off it. */
1180
1181 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1182 tty = strjoina("/dev/", q);
1183 }
1184
1185 if (tty) {
1186 pam_code = pam_set_item(handle, PAM_TTY, tty);
1187 if (pam_code != PAM_SUCCESS)
1188 goto fail;
1189 }
1190
1191 STRV_FOREACH(nv, *env) {
1192 pam_code = pam_putenv(handle, *nv);
1193 if (pam_code != PAM_SUCCESS)
1194 goto fail;
1195 }
1196
1197 pam_code = pam_acct_mgmt(handle, flags);
1198 if (pam_code != PAM_SUCCESS)
1199 goto fail;
1200
1201 pam_code = pam_open_session(handle, flags);
1202 if (pam_code != PAM_SUCCESS)
1203 goto fail;
1204
1205 close_session = true;
1206
1207 e = pam_getenvlist(handle);
1208 if (!e) {
1209 pam_code = PAM_BUF_ERR;
1210 goto fail;
1211 }
1212
1213 /* Block SIGTERM, so that we know that it won't get lost in
1214 * the child */
1215
1216 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1217
1218 parent_pid = getpid_cached();
1219
1220 r = safe_fork("(sd-pam)", 0, &pam_pid);
1221 if (r < 0)
1222 goto fail;
1223 if (r == 0) {
1224 int sig, ret = EXIT_PAM;
1225
1226 /* The child's job is to reset the PAM session on
1227 * termination */
1228 barrier_set_role(&barrier, BARRIER_CHILD);
1229
1230 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
1231 * are open here that have been opened by PAM. */
1232 (void) close_many(fds, n_fds);
1233
1234 /* Drop privileges - we don't need any to pam_close_session
1235 * and this will make PR_SET_PDEATHSIG work in most cases.
1236 * If this fails, ignore the error - but expect sd-pam threads
1237 * to fail to exit normally */
1238
1239 r = maybe_setgroups(0, NULL);
1240 if (r < 0)
1241 log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
1242 if (setresgid(gid, gid, gid) < 0)
1243 log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
1244 if (setresuid(uid, uid, uid) < 0)
1245 log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
1246
1247 (void) ignore_signals(SIGPIPE, -1);
1248
1249 /* Wait until our parent died. This will only work if
1250 * the above setresuid() succeeds, otherwise the kernel
1251 * will not allow unprivileged parents kill their privileged
1252 * children this way. We rely on the control groups kill logic
1253 * to do the rest for us. */
1254 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1255 goto child_finish;
1256
1257 /* Tell the parent that our setup is done. This is especially
1258 * important regarding dropping privileges. Otherwise, unit
1259 * setup might race against our setresuid(2) call.
1260 *
1261 * If the parent aborted, we'll detect this below, hence ignore
1262 * return failure here. */
1263 (void) barrier_place(&barrier);
1264
1265 /* Check if our parent process might already have died? */
1266 if (getppid() == parent_pid) {
1267 sigset_t ss;
1268
1269 assert_se(sigemptyset(&ss) >= 0);
1270 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1271
1272 for (;;) {
1273 if (sigwait(&ss, &sig) < 0) {
1274 if (errno == EINTR)
1275 continue;
1276
1277 goto child_finish;
1278 }
1279
1280 assert(sig == SIGTERM);
1281 break;
1282 }
1283 }
1284
1285 /* If our parent died we'll end the session */
1286 if (getppid() != parent_pid) {
1287 pam_code = pam_close_session(handle, flags);
1288 if (pam_code != PAM_SUCCESS)
1289 goto child_finish;
1290 }
1291
1292 ret = 0;
1293
1294 child_finish:
1295 pam_end(handle, pam_code | flags);
1296 _exit(ret);
1297 }
1298
1299 barrier_set_role(&barrier, BARRIER_PARENT);
1300
1301 /* If the child was forked off successfully it will do all the
1302 * cleanups, so forget about the handle here. */
1303 handle = NULL;
1304
1305 /* Unblock SIGTERM again in the parent */
1306 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1307
1308 /* We close the log explicitly here, since the PAM modules
1309 * might have opened it, but we don't want this fd around. */
1310 closelog();
1311
1312 /* Synchronously wait for the child to initialize. We don't care for
1313 * errors as we cannot recover. However, warn loudly if it happens. */
1314 if (!barrier_place_and_sync(&barrier))
1315 log_error("PAM initialization failed");
1316
1317 return strv_free_and_replace(*env, e);
1318
1319 fail:
1320 if (pam_code != PAM_SUCCESS) {
1321 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1322 r = -EPERM; /* PAM errors do not map to errno */
1323 } else
1324 log_error_errno(r, "PAM failed: %m");
1325
1326 if (handle) {
1327 if (close_session)
1328 pam_code = pam_close_session(handle, flags);
1329
1330 pam_end(handle, pam_code | flags);
1331 }
1332
1333 strv_free(e);
1334 closelog();
1335
1336 return r;
1337 #else
1338 return 0;
1339 #endif
1340 }
1341
/* Derives a short, parenthesized display name from the last component of 'path' and hands it to
 * rename_process(). At most the trailing 8 characters of the file name are kept, so the result
 * (parentheses included) never exceeds 10 characters, i.e. the length of "/sbin/init", which keeps
 * it tidy in ps output. */
static void rename_process_from_path(const char *path) {
        char label[11]; /* '(' + up to 8 characters + ')' + NUL */
        const char *name;
        size_t len, keep;

        name = basename(path);
        if (isempty(name)) {
                /* Nothing usable as a name, fall back to a fixed placeholder */
                rename_process("(...)");
                return;
        }

        /* Keep the tail rather than the head: the beginning is frequently just a common prefix
         * such as "systemd-", the end is what tells processes apart. */
        len = strlen(name);
        keep = len > 8 ? 8 : len;
        name += len - keep;

        label[0] = '(';
        memcpy(label + 1, name, keep);
        label[keep + 1] = ')';
        label[keep + 2] = '\0';

        rename_process(label);
}
1372
1373 static bool context_has_address_families(const ExecContext *c) {
1374 assert(c);
1375
1376 return c->address_families_whitelist ||
1377 !set_isempty(c->address_families);
1378 }
1379
1380 static bool context_has_syscall_filters(const ExecContext *c) {
1381 assert(c);
1382
1383 return c->syscall_whitelist ||
1384 !hashmap_isempty(c->syscall_filter);
1385 }
1386
/* Decides whether the "no new privileges" (NNP) flag is needed for this context. It is needed when
 * explicitly requested, or when we lack CAP_SYS_ADMIN and any of the seccomp-backed settings
 * checked below is enabled. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        /* Explicitly requested */
        if (c->no_new_privileges)
                return true;

        /* If we are privileged, NNP is not required */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* We are unprivileged: any form of seccomp usage requires NNP */
        if (context_has_address_families(c))
                return true;

        if (c->memory_deny_write_execute ||
            c->restrict_realtime ||
            c->restrict_suid_sgid)
                return true;

        if (exec_context_restrict_namespaces_set(c))
                return true;

        if (c->protect_kernel_tunables ||
            c->protect_kernel_modules ||
            c->private_devices)
                return true;

        if (context_has_syscall_filters(c))
                return true;

        if (!set_isempty(c->syscall_archs))
                return true;

        return c->lock_personality || c->protect_hostname;
}
1410
1411 #if HAVE_SECCOMP
1412
1413 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1414
1415 if (is_seccomp_available())
1416 return false;
1417
1418 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1419 return true;
1420 }
1421
1422 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1423 uint32_t negative_action, default_action, action;
1424 int r;
1425
1426 assert(u);
1427 assert(c);
1428
1429 if (!context_has_syscall_filters(c))
1430 return 0;
1431
1432 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1433 return 0;
1434
1435 negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1436
1437 if (c->syscall_whitelist) {
1438 default_action = negative_action;
1439 action = SCMP_ACT_ALLOW;
1440 } else {
1441 default_action = SCMP_ACT_ALLOW;
1442 action = negative_action;
1443 }
1444
1445 if (needs_ambient_hack) {
1446 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1447 if (r < 0)
1448 return r;
1449 }
1450
1451 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1452 }
1453
1454 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1455 assert(u);
1456 assert(c);
1457
1458 if (set_isempty(c->syscall_archs))
1459 return 0;
1460
1461 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1462 return 0;
1463
1464 return seccomp_restrict_archs(c->syscall_archs);
1465 }
1466
1467 static int apply_address_families(const Unit* u, const ExecContext *c) {
1468 assert(u);
1469 assert(c);
1470
1471 if (!context_has_address_families(c))
1472 return 0;
1473
1474 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1475 return 0;
1476
1477 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1478 }
1479
1480 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1481 assert(u);
1482 assert(c);
1483
1484 if (!c->memory_deny_write_execute)
1485 return 0;
1486
1487 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1488 return 0;
1489
1490 return seccomp_memory_deny_write_execute();
1491 }
1492
1493 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1494 assert(u);
1495 assert(c);
1496
1497 if (!c->restrict_realtime)
1498 return 0;
1499
1500 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1501 return 0;
1502
1503 return seccomp_restrict_realtime();
1504 }
1505
1506 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1507 assert(u);
1508 assert(c);
1509
1510 if (!c->restrict_suid_sgid)
1511 return 0;
1512
1513 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1514 return 0;
1515
1516 return seccomp_restrict_suid_sgid();
1517 }
1518
1519 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1520 assert(u);
1521 assert(c);
1522
1523 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1524 * let's protect even those systems where this is left on in the kernel. */
1525
1526 if (!c->protect_kernel_tunables)
1527 return 0;
1528
1529 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1530 return 0;
1531
1532 return seccomp_protect_sysctl();
1533 }
1534
1535 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1536 assert(u);
1537 assert(c);
1538
1539 /* Turn off module syscalls on ProtectKernelModules=yes */
1540
1541 if (!c->protect_kernel_modules)
1542 return 0;
1543
1544 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1545 return 0;
1546
1547 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1548 }
1549
1550 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1551 assert(u);
1552 assert(c);
1553
1554 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1555
1556 if (!c->private_devices)
1557 return 0;
1558
1559 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1560 return 0;
1561
1562 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1563 }
1564
1565 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1566 assert(u);
1567 assert(c);
1568
1569 if (!exec_context_restrict_namespaces_set(c))
1570 return 0;
1571
1572 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1573 return 0;
1574
1575 return seccomp_restrict_namespaces(c->restrict_namespaces);
1576 }
1577
1578 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1579 unsigned long personality;
1580 int r;
1581
1582 assert(u);
1583 assert(c);
1584
1585 if (!c->lock_personality)
1586 return 0;
1587
1588 if (skip_seccomp_unavailable(u, "LockPersonality="))
1589 return 0;
1590
1591 personality = c->personality;
1592
1593 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1594 if (personality == PERSONALITY_INVALID) {
1595
1596 r = opinionated_personality(&personality);
1597 if (r < 0)
1598 return r;
1599 }
1600
1601 return seccomp_lock_personality(personality);
1602 }
1603
1604 #endif
1605
1606 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1607 assert(idle_pipe);
1608
1609 idle_pipe[1] = safe_close(idle_pipe[1]);
1610 idle_pipe[2] = safe_close(idle_pipe[2]);
1611
1612 if (idle_pipe[0] >= 0) {
1613 int r;
1614
1615 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1616
1617 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1618 ssize_t n;
1619
1620 /* Signal systemd that we are bored and want to continue. */
1621 n = write(idle_pipe[3], "x", 1);
1622 if (n > 0)
1623 /* Wait for systemd to react to the signal above. */
1624 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1625 }
1626
1627 idle_pipe[0] = safe_close(idle_pipe[0]);
1628
1629 }
1630
1631 idle_pipe[3] = safe_close(idle_pipe[3]);
1632 }
1633
1634 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1635
1636 static int build_environment(
1637 const Unit *u,
1638 const ExecContext *c,
1639 const ExecParameters *p,
1640 size_t n_fds,
1641 const char *home,
1642 const char *username,
1643 const char *shell,
1644 dev_t journal_stream_dev,
1645 ino_t journal_stream_ino,
1646 char ***ret) {
1647
1648 _cleanup_strv_free_ char **our_env = NULL;
1649 ExecDirectoryType t;
1650 size_t n_env = 0;
1651 char *x;
1652
1653 assert(u);
1654 assert(c);
1655 assert(p);
1656 assert(ret);
1657
1658 our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX);
1659 if (!our_env)
1660 return -ENOMEM;
1661
1662 if (n_fds > 0) {
1663 _cleanup_free_ char *joined = NULL;
1664
1665 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1666 return -ENOMEM;
1667 our_env[n_env++] = x;
1668
1669 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1670 return -ENOMEM;
1671 our_env[n_env++] = x;
1672
1673 joined = strv_join(p->fd_names, ":");
1674 if (!joined)
1675 return -ENOMEM;
1676
1677 x = strjoin("LISTEN_FDNAMES=", joined);
1678 if (!x)
1679 return -ENOMEM;
1680 our_env[n_env++] = x;
1681 }
1682
1683 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1684 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1685 return -ENOMEM;
1686 our_env[n_env++] = x;
1687
1688 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1689 return -ENOMEM;
1690 our_env[n_env++] = x;
1691 }
1692
1693 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1694 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1695 * check the database directly. */
1696 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1697 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1698 if (!x)
1699 return -ENOMEM;
1700 our_env[n_env++] = x;
1701 }
1702
1703 if (home) {
1704 x = strjoin("HOME=", home);
1705 if (!x)
1706 return -ENOMEM;
1707
1708 path_simplify(x + 5, true);
1709 our_env[n_env++] = x;
1710 }
1711
1712 if (username) {
1713 x = strjoin("LOGNAME=", username);
1714 if (!x)
1715 return -ENOMEM;
1716 our_env[n_env++] = x;
1717
1718 x = strjoin("USER=", username);
1719 if (!x)
1720 return -ENOMEM;
1721 our_env[n_env++] = x;
1722 }
1723
1724 if (shell) {
1725 x = strjoin("SHELL=", shell);
1726 if (!x)
1727 return -ENOMEM;
1728
1729 path_simplify(x + 6, true);
1730 our_env[n_env++] = x;
1731 }
1732
1733 if (!sd_id128_is_null(u->invocation_id)) {
1734 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1735 return -ENOMEM;
1736
1737 our_env[n_env++] = x;
1738 }
1739
1740 if (exec_context_needs_term(c)) {
1741 const char *tty_path, *term = NULL;
1742
1743 tty_path = exec_context_tty_path(c);
1744
1745 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1746 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1747 * passes to PID 1 ends up all the way in the console login shown. */
1748
1749 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1750 term = getenv("TERM");
1751 if (!term)
1752 term = default_term_for_tty(tty_path);
1753
1754 x = strjoin("TERM=", term);
1755 if (!x)
1756 return -ENOMEM;
1757 our_env[n_env++] = x;
1758 }
1759
1760 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1761 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1762 return -ENOMEM;
1763
1764 our_env[n_env++] = x;
1765 }
1766
1767 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1768 _cleanup_free_ char *pre = NULL, *joined = NULL;
1769 const char *n;
1770
1771 if (!p->prefix[t])
1772 continue;
1773
1774 if (strv_isempty(c->directories[t].paths))
1775 continue;
1776
1777 n = exec_directory_env_name_to_string(t);
1778 if (!n)
1779 continue;
1780
1781 pre = strjoin(p->prefix[t], "/");
1782 if (!pre)
1783 return -ENOMEM;
1784
1785 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1786 if (!joined)
1787 return -ENOMEM;
1788
1789 x = strjoin(n, "=", joined);
1790 if (!x)
1791 return -ENOMEM;
1792
1793 our_env[n_env++] = x;
1794 }
1795
1796 our_env[n_env++] = NULL;
1797 assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1798
1799 *ret = TAKE_PTR(our_env);
1800
1801 return 0;
1802 }
1803
1804 static int build_pass_environment(const ExecContext *c, char ***ret) {
1805 _cleanup_strv_free_ char **pass_env = NULL;
1806 size_t n_env = 0, n_bufsize = 0;
1807 char **i;
1808
1809 STRV_FOREACH(i, c->pass_environment) {
1810 _cleanup_free_ char *x = NULL;
1811 char *v;
1812
1813 v = getenv(*i);
1814 if (!v)
1815 continue;
1816 x = strjoin(*i, "=", v);
1817 if (!x)
1818 return -ENOMEM;
1819
1820 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1821 return -ENOMEM;
1822
1823 pass_env[n_env++] = TAKE_PTR(x);
1824 pass_env[n_env] = NULL;
1825 }
1826
1827 *ret = TAKE_PTR(pass_env);
1828
1829 return 0;
1830 }
1831
1832 static bool exec_needs_mount_namespace(
1833 const ExecContext *context,
1834 const ExecParameters *params,
1835 const ExecRuntime *runtime) {
1836
1837 assert(context);
1838 assert(params);
1839
1840 if (context->root_image)
1841 return true;
1842
1843 if (!strv_isempty(context->read_write_paths) ||
1844 !strv_isempty(context->read_only_paths) ||
1845 !strv_isempty(context->inaccessible_paths))
1846 return true;
1847
1848 if (context->n_bind_mounts > 0)
1849 return true;
1850
1851 if (context->n_temporary_filesystems > 0)
1852 return true;
1853
1854 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
1855 return true;
1856
1857 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1858 return true;
1859
1860 if (context->private_devices ||
1861 context->private_mounts ||
1862 context->protect_system != PROTECT_SYSTEM_NO ||
1863 context->protect_home != PROTECT_HOME_NO ||
1864 context->protect_kernel_tunables ||
1865 context->protect_kernel_modules ||
1866 context->protect_control_groups)
1867 return true;
1868
1869 if (context->root_directory) {
1870 ExecDirectoryType t;
1871
1872 if (context->mount_apivfs)
1873 return true;
1874
1875 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1876 if (!params->prefix[t])
1877 continue;
1878
1879 if (!strv_isempty(context->directories[t].paths))
1880 return true;
1881 }
1882 }
1883
1884 if (context->dynamic_user &&
1885 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1886 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1887 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1888 return true;
1889
1890 return false;
1891 }
1892
1893 static int setup_private_users(uid_t uid, gid_t gid) {
1894 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1895 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1896 _cleanup_close_ int unshare_ready_fd = -1;
1897 _cleanup_(sigkill_waitp) pid_t pid = 0;
1898 uint64_t c = 1;
1899 ssize_t n;
1900 int r;
1901
1902 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1903 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1904 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1905 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1906 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1907 * continues execution normally. */
1908
1909 if (uid != 0 && uid_is_valid(uid)) {
1910 r = asprintf(&uid_map,
1911 "0 0 1\n" /* Map root → root */
1912 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
1913 uid, uid);
1914 if (r < 0)
1915 return -ENOMEM;
1916 } else {
1917 uid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1918 if (!uid_map)
1919 return -ENOMEM;
1920 }
1921
1922 if (gid != 0 && gid_is_valid(gid)) {
1923 r = asprintf(&gid_map,
1924 "0 0 1\n" /* Map root → root */
1925 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
1926 gid, gid);
1927 if (r < 0)
1928 return -ENOMEM;
1929 } else {
1930 gid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1931 if (!gid_map)
1932 return -ENOMEM;
1933 }
1934
1935 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1936 * namespace. */
1937 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1938 if (unshare_ready_fd < 0)
1939 return -errno;
1940
1941 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1942 * failed. */
1943 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1944 return -errno;
1945
1946 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1947 if (r < 0)
1948 return r;
1949 if (r == 0) {
1950 _cleanup_close_ int fd = -1;
1951 const char *a;
1952 pid_t ppid;
1953
1954 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1955 * here, after the parent opened its own user namespace. */
1956
1957 ppid = getppid();
1958 errno_pipe[0] = safe_close(errno_pipe[0]);
1959
1960 /* Wait until the parent unshared the user namespace */
1961 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1962 r = -errno;
1963 goto child_fail;
1964 }
1965
1966 /* Disable the setgroups() system call in the child user namespace, for good. */
1967 a = procfs_file_alloca(ppid, "setgroups");
1968 fd = open(a, O_WRONLY|O_CLOEXEC);
1969 if (fd < 0) {
1970 if (errno != ENOENT) {
1971 r = -errno;
1972 goto child_fail;
1973 }
1974
1975 /* If the file is missing the kernel is too old, let's continue anyway. */
1976 } else {
1977 if (write(fd, "deny\n", 5) < 0) {
1978 r = -errno;
1979 goto child_fail;
1980 }
1981
1982 fd = safe_close(fd);
1983 }
1984
1985 /* First write the GID map */
1986 a = procfs_file_alloca(ppid, "gid_map");
1987 fd = open(a, O_WRONLY|O_CLOEXEC);
1988 if (fd < 0) {
1989 r = -errno;
1990 goto child_fail;
1991 }
1992 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1993 r = -errno;
1994 goto child_fail;
1995 }
1996 fd = safe_close(fd);
1997
1998 /* The write the UID map */
1999 a = procfs_file_alloca(ppid, "uid_map");
2000 fd = open(a, O_WRONLY|O_CLOEXEC);
2001 if (fd < 0) {
2002 r = -errno;
2003 goto child_fail;
2004 }
2005 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2006 r = -errno;
2007 goto child_fail;
2008 }
2009
2010 _exit(EXIT_SUCCESS);
2011
2012 child_fail:
2013 (void) write(errno_pipe[1], &r, sizeof(r));
2014 _exit(EXIT_FAILURE);
2015 }
2016
2017 errno_pipe[1] = safe_close(errno_pipe[1]);
2018
2019 if (unshare(CLONE_NEWUSER) < 0)
2020 return -errno;
2021
2022 /* Let the child know that the namespace is ready now */
2023 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2024 return -errno;
2025
2026 /* Try to read an error code from the child */
2027 n = read(errno_pipe[0], &r, sizeof(r));
2028 if (n < 0)
2029 return -errno;
2030 if (n == sizeof(r)) { /* an error code was sent to us */
2031 if (r < 0)
2032 return r;
2033 return -EIO;
2034 }
2035 if (n != 0) /* on success we should have read 0 bytes */
2036 return -EIO;
2037
2038 r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2039 pid = 0;
2040 if (r < 0)
2041 return r;
2042 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2043 return -EIO;
2044
2045 return 0;
2046 }
2047
2048 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2049 if (!context->dynamic_user)
2050 return false;
2051
2052 if (type == EXEC_DIRECTORY_CONFIGURATION)
2053 return false;
2054
2055 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2056 return false;
2057
2058 return true;
2059 }
2060
2061 static int setup_exec_directory(
2062 const ExecContext *context,
2063 const ExecParameters *params,
2064 uid_t uid,
2065 gid_t gid,
2066 ExecDirectoryType type,
2067 int *exit_status) {
2068
2069 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2070 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2071 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2072 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2073 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2074 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2075 };
2076 char **rt;
2077 int r;
2078
2079 assert(context);
2080 assert(params);
2081 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2082 assert(exit_status);
2083
2084 if (!params->prefix[type])
2085 return 0;
2086
2087 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2088 if (!uid_is_valid(uid))
2089 uid = 0;
2090 if (!gid_is_valid(gid))
2091 gid = 0;
2092 }
2093
2094 STRV_FOREACH(rt, context->directories[type].paths) {
2095 _cleanup_free_ char *p = NULL, *pp = NULL;
2096
2097 p = path_join(params->prefix[type], *rt);
2098 if (!p) {
2099 r = -ENOMEM;
2100 goto fail;
2101 }
2102
2103 r = mkdir_parents_label(p, 0755);
2104 if (r < 0)
2105 goto fail;
2106
2107 if (exec_directory_is_private(context, type)) {
2108 _cleanup_free_ char *private_root = NULL;
2109
2110 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2111 * case we want to avoid leaving a directory around fully accessible that is owned by
2112 * a dynamic user whose UID is later on reused. To lock this down we use the same
2113 * trick used by container managers to prohibit host users to get access to files of
2114 * the same UID in containers: we place everything inside a directory that has an
2115 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2116 * for unprivileged host code. We then use fs namespacing to make this directory
2117 * permeable for the service itself.
2118 *
2119 * Specifically: for a service which wants a special directory "foo/" we first create
2120 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2121 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2122 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2123 * unprivileged host users can't look into it. Inside of the namespace of the unit
2124 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2125 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2126 * for the service and making sure it only gets access to the dirs it needs but no
2127 * others. Tricky? Yes, absolutely, but it works!
2128 *
2129 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2130 * to be owned by the service itself.
2131 *
2132 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2133 * for sharing files or sockets with other services. */
2134
2135 private_root = path_join(params->prefix[type], "private");
2136 if (!private_root) {
2137 r = -ENOMEM;
2138 goto fail;
2139 }
2140
2141 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2142 r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2143 if (r < 0)
2144 goto fail;
2145
2146 pp = path_join(private_root, *rt);
2147 if (!pp) {
2148 r = -ENOMEM;
2149 goto fail;
2150 }
2151
2152 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2153 r = mkdir_parents_label(pp, 0755);
2154 if (r < 0)
2155 goto fail;
2156
2157 if (is_dir(p, false) > 0 &&
2158 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2159
2160 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2161 * it over. Most likely the service has been upgraded from one that didn't use
2162 * DynamicUser=1, to one that does. */
2163
2164 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2165 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2166 exec_directory_type_to_string(type), p, pp);
2167
2168 if (rename(p, pp) < 0) {
2169 r = -errno;
2170 goto fail;
2171 }
2172 } else {
2173 /* Otherwise, create the actual directory for the service */
2174
2175 r = mkdir_label(pp, context->directories[type].mode);
2176 if (r < 0 && r != -EEXIST)
2177 goto fail;
2178 }
2179
2180 /* And link it up from the original place */
2181 r = symlink_idempotent(pp, p, true);
2182 if (r < 0)
2183 goto fail;
2184
2185 } else {
2186 _cleanup_free_ char *target = NULL;
2187
2188 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2189 readlink_and_make_absolute(p, &target) >= 0) {
2190 _cleanup_free_ char *q = NULL;
2191
2192 /* This already exists and is a symlink? Interesting. Maybe it's one created
2193 * by DynamicUser=1 (see above)?
2194 *
2195 * We do this for all directory types except for ConfigurationDirectory=,
2196 * since they all support the private/ symlink logic at least in some
2197 * configurations, see above. */
2198
2199 q = path_join(params->prefix[type], "private", *rt);
2200 if (!q) {
2201 r = -ENOMEM;
2202 goto fail;
2203 }
2204
2205 if (path_equal(q, target)) {
2206
2207 /* Hmm, apparently DynamicUser= was once turned on for this service,
2208 * but is no longer. Let's move the directory back up. */
2209
2210 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2211 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2212 exec_directory_type_to_string(type), q, p);
2213
2214 if (unlink(p) < 0) {
2215 r = -errno;
2216 goto fail;
2217 }
2218
2219 if (rename(q, p) < 0) {
2220 r = -errno;
2221 goto fail;
2222 }
2223 }
2224 }
2225
2226 r = mkdir_label(p, context->directories[type].mode);
2227 if (r < 0) {
2228 if (r != -EEXIST)
2229 goto fail;
2230
2231 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2232 struct stat st;
2233
2234 /* Don't change the owner/access mode of the configuration directory,
2235 * as in the common case it is not written to by a service, and shall
2236 * not be writable. */
2237
2238 if (stat(p, &st) < 0) {
2239 r = -errno;
2240 goto fail;
2241 }
2242
2243 /* Still complain if the access mode doesn't match */
2244 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2245 log_warning("%s \'%s\' already exists but the mode is different. "
2246 "(File system: %o %sMode: %o)",
2247 exec_directory_type_to_string(type), *rt,
2248 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2249
2250 continue;
2251 }
2252 }
2253 }
2254
2255 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2256 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2257 * current UID/GID ownership.) */
2258 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2259 if (r < 0)
2260 goto fail;
2261
2262 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2263 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2264 * assignments to exist.*/
2265 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2266 if (r < 0)
2267 goto fail;
2268 }
2269
2270 return 0;
2271
2272 fail:
2273 *exit_status = exit_status_table[type];
2274 return r;
2275 }
2276
2277 #if ENABLE_SMACK
2278 static int setup_smack(
2279 const ExecContext *context,
2280 const ExecCommand *command) {
2281
2282 int r;
2283
2284 assert(context);
2285 assert(command);
2286
2287 if (context->smack_process_label) {
2288 r = mac_smack_apply_pid(0, context->smack_process_label);
2289 if (r < 0)
2290 return r;
2291 }
2292 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2293 else {
2294 _cleanup_free_ char *exec_label = NULL;
2295
2296 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2297 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2298 return r;
2299
2300 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2301 if (r < 0)
2302 return r;
2303 }
2304 #endif
2305
2306 return 0;
2307 }
2308 #endif
2309
2310 static int compile_bind_mounts(
2311 const ExecContext *context,
2312 const ExecParameters *params,
2313 BindMount **ret_bind_mounts,
2314 size_t *ret_n_bind_mounts,
2315 char ***ret_empty_directories) {
2316
2317 _cleanup_strv_free_ char **empty_directories = NULL;
2318 BindMount *bind_mounts;
2319 size_t n, h = 0, i;
2320 ExecDirectoryType t;
2321 int r;
2322
2323 assert(context);
2324 assert(params);
2325 assert(ret_bind_mounts);
2326 assert(ret_n_bind_mounts);
2327 assert(ret_empty_directories);
2328
2329 n = context->n_bind_mounts;
2330 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2331 if (!params->prefix[t])
2332 continue;
2333
2334 n += strv_length(context->directories[t].paths);
2335 }
2336
2337 if (n <= 0) {
2338 *ret_bind_mounts = NULL;
2339 *ret_n_bind_mounts = 0;
2340 *ret_empty_directories = NULL;
2341 return 0;
2342 }
2343
2344 bind_mounts = new(BindMount, n);
2345 if (!bind_mounts)
2346 return -ENOMEM;
2347
2348 for (i = 0; i < context->n_bind_mounts; i++) {
2349 BindMount *item = context->bind_mounts + i;
2350 char *s, *d;
2351
2352 s = strdup(item->source);
2353 if (!s) {
2354 r = -ENOMEM;
2355 goto finish;
2356 }
2357
2358 d = strdup(item->destination);
2359 if (!d) {
2360 free(s);
2361 r = -ENOMEM;
2362 goto finish;
2363 }
2364
2365 bind_mounts[h++] = (BindMount) {
2366 .source = s,
2367 .destination = d,
2368 .read_only = item->read_only,
2369 .recursive = item->recursive,
2370 .ignore_enoent = item->ignore_enoent,
2371 };
2372 }
2373
2374 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2375 char **suffix;
2376
2377 if (!params->prefix[t])
2378 continue;
2379
2380 if (strv_isempty(context->directories[t].paths))
2381 continue;
2382
2383 if (exec_directory_is_private(context, t) &&
2384 !(context->root_directory || context->root_image)) {
2385 char *private_root;
2386
2387 /* So this is for a dynamic user, and we need to make sure the process can access its own
2388 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2389 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2390
2391 private_root = path_join(params->prefix[t], "private");
2392 if (!private_root) {
2393 r = -ENOMEM;
2394 goto finish;
2395 }
2396
2397 r = strv_consume(&empty_directories, private_root);
2398 if (r < 0)
2399 goto finish;
2400 }
2401
2402 STRV_FOREACH(suffix, context->directories[t].paths) {
2403 char *s, *d;
2404
2405 if (exec_directory_is_private(context, t))
2406 s = path_join(params->prefix[t], "private", *suffix);
2407 else
2408 s = path_join(params->prefix[t], *suffix);
2409 if (!s) {
2410 r = -ENOMEM;
2411 goto finish;
2412 }
2413
2414 if (exec_directory_is_private(context, t) &&
2415 (context->root_directory || context->root_image))
2416 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2417 * directory is not created on the root directory. So, let's bind-mount the directory
2418 * on the 'non-private' place. */
2419 d = path_join(params->prefix[t], *suffix);
2420 else
2421 d = strdup(s);
2422 if (!d) {
2423 free(s);
2424 r = -ENOMEM;
2425 goto finish;
2426 }
2427
2428 bind_mounts[h++] = (BindMount) {
2429 .source = s,
2430 .destination = d,
2431 .read_only = false,
2432 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2433 .recursive = true,
2434 .ignore_enoent = false,
2435 };
2436 }
2437 }
2438
2439 assert(h == n);
2440
2441 *ret_bind_mounts = bind_mounts;
2442 *ret_n_bind_mounts = n;
2443 *ret_empty_directories = TAKE_PTR(empty_directories);
2444
2445 return (int) n;
2446
2447 finish:
2448 bind_mount_free_many(bind_mounts, h);
2449 return r;
2450 }
2451
2452 static int apply_mount_namespace(
2453 const Unit *u,
2454 const ExecCommand *command,
2455 const ExecContext *context,
2456 const ExecParameters *params,
2457 const ExecRuntime *runtime,
2458 char **error_path) {
2459
2460 _cleanup_strv_free_ char **empty_directories = NULL;
2461 char *tmp = NULL, *var = NULL;
2462 const char *root_dir = NULL, *root_image = NULL;
2463 NamespaceInfo ns_info;
2464 bool needs_sandboxing;
2465 BindMount *bind_mounts = NULL;
2466 size_t n_bind_mounts = 0;
2467 int r;
2468
2469 assert(context);
2470
2471 /* The runtime struct only contains the parent of the private /tmp,
2472 * which is non-accessible to world users. Inside of it there's a /tmp
2473 * that is sticky, and that's the one we want to use here. */
2474
2475 if (context->private_tmp && runtime) {
2476 if (runtime->tmp_dir)
2477 tmp = strjoina(runtime->tmp_dir, "/tmp");
2478 if (runtime->var_tmp_dir)
2479 var = strjoina(runtime->var_tmp_dir, "/tmp");
2480 }
2481
2482 if (params->flags & EXEC_APPLY_CHROOT) {
2483 root_image = context->root_image;
2484
2485 if (!root_image)
2486 root_dir = context->root_directory;
2487 }
2488
2489 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2490 if (r < 0)
2491 return r;
2492
2493 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2494 if (needs_sandboxing)
2495 ns_info = (NamespaceInfo) {
2496 .ignore_protect_paths = false,
2497 .private_dev = context->private_devices,
2498 .protect_control_groups = context->protect_control_groups,
2499 .protect_kernel_tunables = context->protect_kernel_tunables,
2500 .protect_kernel_modules = context->protect_kernel_modules,
2501 .protect_hostname = context->protect_hostname,
2502 .mount_apivfs = context->mount_apivfs,
2503 .private_mounts = context->private_mounts,
2504 };
2505 else if (!context->dynamic_user && root_dir)
2506 /*
2507 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2508 * sandbox info, otherwise enforce it, don't ignore protected paths and
2509 * fail if we are enable to apply the sandbox inside the mount namespace.
2510 */
2511 ns_info = (NamespaceInfo) {
2512 .ignore_protect_paths = true,
2513 };
2514 else
2515 ns_info = (NamespaceInfo) {};
2516
2517 if (context->mount_flags == MS_SHARED)
2518 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2519
2520 r = setup_namespace(root_dir, root_image,
2521 &ns_info, context->read_write_paths,
2522 needs_sandboxing ? context->read_only_paths : NULL,
2523 needs_sandboxing ? context->inaccessible_paths : NULL,
2524 empty_directories,
2525 bind_mounts,
2526 n_bind_mounts,
2527 context->temporary_filesystems,
2528 context->n_temporary_filesystems,
2529 tmp,
2530 var,
2531 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2532 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2533 context->mount_flags,
2534 DISSECT_IMAGE_DISCARD_ON_LOOP,
2535 error_path);
2536
2537 bind_mount_free_many(bind_mounts, n_bind_mounts);
2538
2539 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2540 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2541 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2542 * completely different execution environment. */
2543 if (r == -ENOANO) {
2544 if (n_bind_mounts == 0 &&
2545 context->n_temporary_filesystems == 0 &&
2546 !root_dir && !root_image &&
2547 !context->dynamic_user) {
2548 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2549 return 0;
2550 }
2551
2552 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2553 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2554 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2555
2556 return -EOPNOTSUPP;
2557 }
2558
2559 return r;
2560 }
2561
2562 static int apply_working_directory(
2563 const ExecContext *context,
2564 const ExecParameters *params,
2565 const char *home,
2566 int *exit_status) {
2567
2568 const char *d, *wd;
2569
2570 assert(context);
2571 assert(exit_status);
2572
2573 if (context->working_directory_home) {
2574
2575 if (!home) {
2576 *exit_status = EXIT_CHDIR;
2577 return -ENXIO;
2578 }
2579
2580 wd = home;
2581
2582 } else if (context->working_directory)
2583 wd = context->working_directory;
2584 else
2585 wd = "/";
2586
2587 if (params->flags & EXEC_APPLY_CHROOT)
2588 d = wd;
2589 else
2590 d = prefix_roota(context->root_directory, wd);
2591
2592 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2593 *exit_status = EXIT_CHDIR;
2594 return -errno;
2595 }
2596
2597 return 0;
2598 }
2599
2600 static int apply_root_directory(
2601 const ExecContext *context,
2602 const ExecParameters *params,
2603 const bool needs_mount_ns,
2604 int *exit_status) {
2605
2606 assert(context);
2607 assert(exit_status);
2608
2609 if (params->flags & EXEC_APPLY_CHROOT) {
2610 if (!needs_mount_ns && context->root_directory)
2611 if (chroot(context->root_directory) < 0) {
2612 *exit_status = EXIT_CHROOT;
2613 return -errno;
2614 }
2615 }
2616
2617 return 0;
2618 }
2619
2620 static int setup_keyring(
2621 const Unit *u,
2622 const ExecContext *context,
2623 const ExecParameters *p,
2624 uid_t uid, gid_t gid) {
2625
2626 key_serial_t keyring;
2627 int r = 0;
2628 uid_t saved_uid;
2629 gid_t saved_gid;
2630
2631 assert(u);
2632 assert(context);
2633 assert(p);
2634
2635 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2636 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2637 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2638 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2639 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2640 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2641
2642 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2643 return 0;
2644
2645 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2646 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2647 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2648 * & group is just as nasty as acquiring a reference to the user keyring. */
2649
2650 saved_uid = getuid();
2651 saved_gid = getgid();
2652
2653 if (gid_is_valid(gid) && gid != saved_gid) {
2654 if (setregid(gid, -1) < 0)
2655 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2656 }
2657
2658 if (uid_is_valid(uid) && uid != saved_uid) {
2659 if (setreuid(uid, -1) < 0) {
2660 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2661 goto out;
2662 }
2663 }
2664
2665 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2666 if (keyring == -1) {
2667 if (errno == ENOSYS)
2668 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2669 else if (IN_SET(errno, EACCES, EPERM))
2670 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2671 else if (errno == EDQUOT)
2672 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2673 else
2674 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2675
2676 goto out;
2677 }
2678
2679 /* When requested link the user keyring into the session keyring. */
2680 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2681
2682 if (keyctl(KEYCTL_LINK,
2683 KEY_SPEC_USER_KEYRING,
2684 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2685 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2686 goto out;
2687 }
2688 }
2689
2690 /* Restore uid/gid back */
2691 if (uid_is_valid(uid) && uid != saved_uid) {
2692 if (setreuid(saved_uid, -1) < 0) {
2693 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2694 goto out;
2695 }
2696 }
2697
2698 if (gid_is_valid(gid) && gid != saved_gid) {
2699 if (setregid(saved_gid, -1) < 0)
2700 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2701 }
2702
2703 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2704 if (!sd_id128_is_null(u->invocation_id)) {
2705 key_serial_t key;
2706
2707 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2708 if (key == -1)
2709 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2710 else {
2711 if (keyctl(KEYCTL_SETPERM, key,
2712 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2713 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2714 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2715 }
2716 }
2717
2718 out:
2719 /* Revert back uid & gid for the the last time, and exit */
2720 /* no extra logging, as only the first already reported error matters */
2721 if (getuid() != saved_uid)
2722 (void) setreuid(saved_uid, -1);
2723
2724 if (getgid() != saved_gid)
2725 (void) setregid(saved_gid, -1);
2726
2727 return r;
2728 }
2729
/* Appends each non-negative element of the two-element 'pair' to 'array', first element first, and
 * advances *n by the number of elements appended. Negative elements are skipped. The caller has to make
 * sure 'array' has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        size_t k;

        assert(array);
        assert(n);
        assert(pair);

        for (k = 0; k < 2; k++)
                if (pair[k] >= 0)
                        array[(*n)++] = pair[k];
}
2740
2741 static int close_remaining_fds(
2742 const ExecParameters *params,
2743 const ExecRuntime *runtime,
2744 const DynamicCreds *dcreds,
2745 int user_lookup_fd,
2746 int socket_fd,
2747 int exec_fd,
2748 int *fds, size_t n_fds) {
2749
2750 size_t n_dont_close = 0;
2751 int dont_close[n_fds + 12];
2752
2753 assert(params);
2754
2755 if (params->stdin_fd >= 0)
2756 dont_close[n_dont_close++] = params->stdin_fd;
2757 if (params->stdout_fd >= 0)
2758 dont_close[n_dont_close++] = params->stdout_fd;
2759 if (params->stderr_fd >= 0)
2760 dont_close[n_dont_close++] = params->stderr_fd;
2761
2762 if (socket_fd >= 0)
2763 dont_close[n_dont_close++] = socket_fd;
2764 if (exec_fd >= 0)
2765 dont_close[n_dont_close++] = exec_fd;
2766 if (n_fds > 0) {
2767 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2768 n_dont_close += n_fds;
2769 }
2770
2771 if (runtime)
2772 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2773
2774 if (dcreds) {
2775 if (dcreds->user)
2776 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2777 if (dcreds->group)
2778 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2779 }
2780
2781 if (user_lookup_fd >= 0)
2782 dont_close[n_dont_close++] = user_lookup_fd;
2783
2784 return close_all_fds(dont_close, n_dont_close);
2785 }
2786
2787 static int send_user_lookup(
2788 Unit *unit,
2789 int user_lookup_fd,
2790 uid_t uid,
2791 gid_t gid) {
2792
2793 assert(unit);
2794
2795 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2796 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2797 * specified. */
2798
2799 if (user_lookup_fd < 0)
2800 return 0;
2801
2802 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2803 return 0;
2804
2805 if (writev(user_lookup_fd,
2806 (struct iovec[]) {
2807 IOVEC_INIT(&uid, sizeof(uid)),
2808 IOVEC_INIT(&gid, sizeof(gid)),
2809 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2810 return -errno;
2811
2812 return 0;
2813 }
2814
2815 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2816 int r;
2817
2818 assert(c);
2819 assert(home);
2820 assert(buf);
2821
2822 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2823
2824 if (*home)
2825 return 0;
2826
2827 if (!c->working_directory_home)
2828 return 0;
2829
2830 r = get_home_dir(buf);
2831 if (r < 0)
2832 return r;
2833
2834 *home = *buf;
2835 return 1;
2836 }
2837
2838 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2839 _cleanup_strv_free_ char ** list = NULL;
2840 ExecDirectoryType t;
2841 int r;
2842
2843 assert(c);
2844 assert(p);
2845 assert(ret);
2846
2847 assert(c->dynamic_user);
2848
2849 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2850 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2851 * directories. */
2852
2853 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2854 char **i;
2855
2856 if (t == EXEC_DIRECTORY_CONFIGURATION)
2857 continue;
2858
2859 if (!p->prefix[t])
2860 continue;
2861
2862 STRV_FOREACH(i, c->directories[t].paths) {
2863 char *e;
2864
2865 if (exec_directory_is_private(c, t))
2866 e = path_join(p->prefix[t], "private", *i);
2867 else
2868 e = path_join(p->prefix[t], *i);
2869 if (!e)
2870 return -ENOMEM;
2871
2872 r = strv_consume(&list, e);
2873 if (r < 0)
2874 return r;
2875 }
2876 }
2877
2878 *ret = TAKE_PTR(list);
2879
2880 return 0;
2881 }
2882
2883 static char *exec_command_line(char **argv);
2884
2885 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
2886 bool using_subcgroup;
2887 char *p;
2888
2889 assert(params);
2890 assert(ret);
2891
2892 if (!params->cgroup_path)
2893 return -EINVAL;
2894
2895 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
2896 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
2897 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
2898 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
2899 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
2900 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
2901 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
2902 * flag, which is only passed for the former statements, not for the latter. */
2903
2904 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
2905 if (using_subcgroup)
2906 p = path_join(params->cgroup_path, ".control");
2907 else
2908 p = strdup(params->cgroup_path);
2909 if (!p)
2910 return -ENOMEM;
2911
2912 *ret = p;
2913 return using_subcgroup;
2914 }
2915
2916 static int exec_child(
2917 Unit *unit,
2918 const ExecCommand *command,
2919 const ExecContext *context,
2920 const ExecParameters *params,
2921 ExecRuntime *runtime,
2922 DynamicCreds *dcreds,
2923 int socket_fd,
2924 const int named_iofds[static 3],
2925 int *fds,
2926 size_t n_socket_fds,
2927 size_t n_storage_fds,
2928 char **files_env,
2929 int user_lookup_fd,
2930 int *exit_status) {
2931
2932 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
2933 int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
2934 _cleanup_free_ gid_t *supplementary_gids = NULL;
2935 const char *username = NULL, *groupname = NULL;
2936 _cleanup_free_ char *home_buffer = NULL;
2937 const char *home = NULL, *shell = NULL;
2938 char **final_argv = NULL;
2939 dev_t journal_stream_dev = 0;
2940 ino_t journal_stream_ino = 0;
2941 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2942 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
2943 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
2944 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
2945 #if HAVE_SELINUX
2946 _cleanup_free_ char *mac_selinux_context_net = NULL;
2947 bool use_selinux = false;
2948 #endif
2949 #if ENABLE_SMACK
2950 bool use_smack = false;
2951 #endif
2952 #if HAVE_APPARMOR
2953 bool use_apparmor = false;
2954 #endif
2955 uid_t uid = UID_INVALID;
2956 gid_t gid = GID_INVALID;
2957 size_t n_fds;
2958 ExecDirectoryType dt;
2959 int secure_bits;
2960
2961 assert(unit);
2962 assert(command);
2963 assert(context);
2964 assert(params);
2965 assert(exit_status);
2966
2967 rename_process_from_path(command->path);
2968
2969 /* We reset exactly these signals, since they are the
2970 * only ones we set to SIG_IGN in the main daemon. All
2971 * others we leave untouched because we set them to
2972 * SIG_DFL or a valid handler initially, both of which
2973 * will be demoted to SIG_DFL. */
2974 (void) default_signals(SIGNALS_CRASH_HANDLER,
2975 SIGNALS_IGNORE, -1);
2976
2977 if (context->ignore_sigpipe)
2978 (void) ignore_signals(SIGPIPE, -1);
2979
2980 r = reset_signal_mask();
2981 if (r < 0) {
2982 *exit_status = EXIT_SIGNAL_MASK;
2983 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2984 }
2985
2986 if (params->idle_pipe)
2987 do_idle_pipe_dance(params->idle_pipe);
2988
2989 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2990 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2991 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2992 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2993
2994 log_forget_fds();
2995 log_set_open_when_needed(true);
2996
2997 /* In case anything used libc syslog(), close this here, too */
2998 closelog();
2999
3000 n_fds = n_socket_fds + n_storage_fds;
3001 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
3002 if (r < 0) {
3003 *exit_status = EXIT_FDS;
3004 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3005 }
3006
3007 if (!context->same_pgrp)
3008 if (setsid() < 0) {
3009 *exit_status = EXIT_SETSID;
3010 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3011 }
3012
3013 exec_context_tty_reset(context, params);
3014
3015 if (unit_shall_confirm_spawn(unit)) {
3016 const char *vc = params->confirm_spawn;
3017 _cleanup_free_ char *cmdline = NULL;
3018
3019 cmdline = exec_command_line(command->argv);
3020 if (!cmdline) {
3021 *exit_status = EXIT_MEMORY;
3022 return log_oom();
3023 }
3024
3025 r = ask_for_confirmation(vc, unit, cmdline);
3026 if (r != CONFIRM_EXECUTE) {
3027 if (r == CONFIRM_PRETEND_SUCCESS) {
3028 *exit_status = EXIT_SUCCESS;
3029 return 0;
3030 }
3031 *exit_status = EXIT_CONFIRM;
3032 log_unit_error(unit, "Execution cancelled by the user");
3033 return -ECANCELED;
3034 }
3035 }
3036
3037 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3038 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3039 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3040 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3041 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3042 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3043 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3044 *exit_status = EXIT_MEMORY;
3045 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3046 }
3047
3048 if (context->dynamic_user && dcreds) {
3049 _cleanup_strv_free_ char **suggested_paths = NULL;
3050
3051 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3052 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3053 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3054 *exit_status = EXIT_USER;
3055 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3056 }
3057
3058 r = compile_suggested_paths(context, params, &suggested_paths);
3059 if (r < 0) {
3060 *exit_status = EXIT_MEMORY;
3061 return log_oom();
3062 }
3063
3064 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3065 if (r < 0) {
3066 *exit_status = EXIT_USER;
3067 if (r == -EILSEQ) {
3068 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3069 return -EOPNOTSUPP;
3070 }
3071 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3072 }
3073
3074 if (!uid_is_valid(uid)) {
3075 *exit_status = EXIT_USER;
3076 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
3077 return -ESRCH;
3078 }
3079
3080 if (!gid_is_valid(gid)) {
3081 *exit_status = EXIT_USER;
3082 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
3083 return -ESRCH;
3084 }
3085
3086 if (dcreds->user)
3087 username = dcreds->user->name;
3088
3089 } else {
3090 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3091 if (r < 0) {
3092 *exit_status = EXIT_USER;
3093 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3094 }
3095
3096 r = get_fixed_group(context, &groupname, &gid);
3097 if (r < 0) {
3098 *exit_status = EXIT_GROUP;
3099 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3100 }
3101 }
3102
3103 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3104 r = get_supplementary_groups(context, username, groupname, gid,
3105 &supplementary_gids, &ngids);
3106 if (r < 0) {
3107 *exit_status = EXIT_GROUP;
3108 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3109 }
3110
3111 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3112 if (r < 0) {
3113 *exit_status = EXIT_USER;
3114 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3115 }
3116
3117 user_lookup_fd = safe_close(user_lookup_fd);
3118
3119 r = acquire_home(context, uid, &home, &home_buffer);
3120 if (r < 0) {
3121 *exit_status = EXIT_CHDIR;
3122 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3123 }
3124
3125 /* If a socket is connected to STDIN/STDOUT/STDERR, we
3126 * must sure to drop O_NONBLOCK */
3127 if (socket_fd >= 0)
3128 (void) fd_nonblock(socket_fd, false);
3129
3130 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3131 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3132 if (params->cgroup_path) {
3133 _cleanup_free_ char *p = NULL;
3134
3135 r = exec_parameters_get_cgroup_path(params, &p);
3136 if (r < 0) {
3137 *exit_status = EXIT_CGROUP;
3138 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3139 }
3140
3141 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3142 if (r < 0) {
3143 *exit_status = EXIT_CGROUP;
3144 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3145 }
3146 }
3147
3148 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3149 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3150 if (r < 0) {
3151 *exit_status = EXIT_NETWORK;
3152 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3153 }
3154 }
3155
3156 r = setup_input(context, params, socket_fd, named_iofds);
3157 if (r < 0) {
3158 *exit_status = EXIT_STDIN;
3159 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3160 }
3161
3162 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3163 if (r < 0) {
3164 *exit_status = EXIT_STDOUT;
3165 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3166 }
3167
3168 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3169 if (r < 0) {
3170 *exit_status = EXIT_STDERR;
3171 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3172 }
3173
3174 if (context->oom_score_adjust_set) {
3175 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3176 * prohibit write access to this file, and we shouldn't trip up over that. */
3177 r = set_oom_score_adjust(context->oom_score_adjust);
3178 if (IN_SET(r, -EPERM, -EACCES))
3179 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3180 else if (r < 0) {
3181 *exit_status = EXIT_OOM_ADJUST;
3182 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3183 }
3184 }
3185
3186 if (context->nice_set)
3187 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
3188 *exit_status = EXIT_NICE;
3189 return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
3190 }
3191
3192 if (context->cpu_sched_set) {
3193 struct sched_param param = {
3194 .sched_priority = context->cpu_sched_priority,
3195 };
3196
3197 r = sched_setscheduler(0,
3198 context->cpu_sched_policy |
3199 (context->cpu_sched_reset_on_fork ?
3200 SCHED_RESET_ON_FORK : 0),
3201 &param);
3202 if (r < 0) {
3203 *exit_status = EXIT_SETSCHEDULER;
3204 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3205 }
3206 }
3207
3208 if (context->cpu_set.set)
3209 if (sched_setaffinity(0, context->cpu_set.allocated, context->cpu_set.set) < 0) {
3210 *exit_status = EXIT_CPUAFFINITY;
3211 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3212 }
3213
3214 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3215 r = apply_numa_policy(&context->numa_policy);
3216 if (r == -EOPNOTSUPP)
3217 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
3218 else if (r < 0) {
3219 *exit_status = EXIT_NUMA_POLICY;
3220 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
3221 }
3222 }
3223
3224 if (context->ioprio_set)
3225 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3226 *exit_status = EXIT_IOPRIO;
3227 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3228 }
3229
3230 if (context->timer_slack_nsec != NSEC_INFINITY)
3231 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3232 *exit_status = EXIT_TIMERSLACK;
3233 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3234 }
3235
3236 if (context->personality != PERSONALITY_INVALID) {
3237 r = safe_personality(context->personality);
3238 if (r < 0) {
3239 *exit_status = EXIT_PERSONALITY;
3240 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3241 }
3242 }
3243
3244 if (context->utmp_id)
3245 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3246 context->tty_path,
3247 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
3248 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3249 USER_PROCESS,
3250 username);
3251
3252 if (uid_is_valid(uid)) {
3253 r = chown_terminal(STDIN_FILENO, uid);
3254 if (r < 0) {
3255 *exit_status = EXIT_STDIN;
3256 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3257 }
3258 }
3259
3260 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3261 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3262 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3263 * touch a single hierarchy too. */
3264 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3265 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3266 if (r < 0) {
3267 *exit_status = EXIT_CGROUP;
3268 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3269 }
3270 }
3271
3272 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3273 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3274 if (r < 0)
3275 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3276 }
3277
3278 r = build_environment(
3279 unit,
3280 context,
3281 params,
3282 n_fds,
3283 home,
3284 username,
3285 shell,
3286 journal_stream_dev,
3287 journal_stream_ino,
3288 &our_env);
3289 if (r < 0) {
3290 *exit_status = EXIT_MEMORY;
3291 return log_oom();
3292 }
3293
3294 r = build_pass_environment(context, &pass_env);
3295 if (r < 0) {
3296 *exit_status = EXIT_MEMORY;
3297 return log_oom();
3298 }
3299
3300 accum_env = strv_env_merge(5,
3301 params->environment,
3302 our_env,
3303 pass_env,
3304 context->environment,
3305 files_env,
3306 NULL);
3307 if (!accum_env) {
3308 *exit_status = EXIT_MEMORY;
3309 return log_oom();
3310 }
3311 accum_env = strv_env_clean(accum_env);
3312
3313 (void) umask(context->umask);
3314
3315 r = setup_keyring(unit, context, params, uid, gid);
3316 if (r < 0) {
3317 *exit_status = EXIT_KEYRING;
3318 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3319 }
3320
3321 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3322 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3323
3324 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3325 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3326
3327 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3328 if (needs_ambient_hack)
3329 needs_setuid = false;
3330 else
3331 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3332
3333 if (needs_sandboxing) {
3334 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3335 * present. The actual MAC context application will happen later, as late as possible, to avoid
3336 * impacting our own code paths. */
3337
3338 #if HAVE_SELINUX
3339 use_selinux = mac_selinux_use();
3340 #endif
3341 #if ENABLE_SMACK
3342 use_smack = mac_smack_use();
3343 #endif
3344 #if HAVE_APPARMOR
3345 use_apparmor = mac_apparmor_use();
3346 #endif
3347 }
3348
3349 if (needs_sandboxing) {
3350 int which_failed;
3351
3352 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3353 * is set here. (See below.) */
3354
3355 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3356 if (r < 0) {
3357 *exit_status = EXIT_LIMITS;
3358 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3359 }
3360 }
3361
3362 if (needs_setuid) {
3363
3364 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3365 * wins here. (See above.) */
3366
3367 if (context->pam_name && username) {
3368 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3369 if (r < 0) {
3370 *exit_status = EXIT_PAM;
3371 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3372 }
3373 }
3374 }
3375
3376 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3377
3378 if (ns_type_supported(NAMESPACE_NET)) {
3379 r = setup_netns(runtime->netns_storage_socket);
3380 if (r < 0) {
3381 *exit_status = EXIT_NETWORK;
3382 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3383 }
3384 } else if (context->network_namespace_path) {
3385 *exit_status = EXIT_NETWORK;
3386 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP), "NetworkNamespacePath= is not supported, refusing.");
3387 } else
3388 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3389 }
3390
3391 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3392 if (needs_mount_namespace) {
3393 _cleanup_free_ char *error_path = NULL;
3394
3395 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3396 if (r < 0) {
3397 *exit_status = EXIT_NAMESPACE;
3398 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3399 error_path ? ": " : "", strempty(error_path));
3400 }
3401 }
3402
3403 if (context->protect_hostname) {
3404 if (ns_type_supported(NAMESPACE_UTS)) {
3405 if (unshare(CLONE_NEWUTS) < 0) {
3406 *exit_status = EXIT_NAMESPACE;
3407 return log_unit_error_errno(unit, errno, "Failed to set up UTS namespacing: %m");
3408 }
3409 } else
3410 log_unit_warning(unit, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
3411 #if HAVE_SECCOMP
3412 r = seccomp_protect_hostname();
3413 if (r < 0) {
3414 *exit_status = EXIT_SECCOMP;
3415 return log_unit_error_errno(unit, r, "Failed to apply hostname restrictions: %m");
3416 }
3417 #endif
3418 }
3419
3420 /* Drop groups as early as possible */
3421 if (needs_setuid) {
3422 r = enforce_groups(gid, supplementary_gids, ngids);
3423 if (r < 0) {
3424 *exit_status = EXIT_GROUP;
3425 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3426 }
3427 }
3428
3429 if (needs_sandboxing) {
3430 #if HAVE_SELINUX
3431 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3432 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3433 if (r < 0) {
3434 *exit_status = EXIT_SELINUX_CONTEXT;
3435 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3436 }
3437 }
3438 #endif
3439
3440 if (context->private_users) {
3441 r = setup_private_users(uid, gid);
3442 if (r < 0) {
3443 *exit_status = EXIT_USER;
3444 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3445 }
3446 }
3447 }
3448
3449 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3450 * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3451 * however if we have it as we want to keep it open until the final execve(). */
3452
3453 if (params->exec_fd >= 0) {
3454 exec_fd = params->exec_fd;
3455
3456 if (exec_fd < 3 + (int) n_fds) {
3457 int moved_fd;
3458
3459 /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3460 * process we are about to execute. */
3461
3462 moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3463 if (moved_fd < 0) {
3464 *exit_status = EXIT_FDS;
3465 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3466 }
3467
3468 safe_close(exec_fd);
3469 exec_fd = moved_fd;
3470 } else {
3471 /* This fd should be FD_CLOEXEC already, but let's make sure. */
3472 r = fd_cloexec(exec_fd, true);
3473 if (r < 0) {
3474 *exit_status = EXIT_FDS;
3475 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3476 }
3477 }
3478
3479 fds_with_exec_fd = newa(int, n_fds + 1);
3480 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3481 fds_with_exec_fd[n_fds] = exec_fd;
3482 n_fds_with_exec_fd = n_fds + 1;
3483 } else {
3484 fds_with_exec_fd = fds;
3485 n_fds_with_exec_fd = n_fds;
3486 }
3487
3488 r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3489 if (r >= 0)
3490 r = shift_fds(fds, n_fds);
3491 if (r >= 0)
3492 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3493 if (r < 0) {
3494 *exit_status = EXIT_FDS;
3495 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3496 }
3497
3498 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3499 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3500 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3501 * came this far. */
3502
3503 secure_bits = context->secure_bits;
3504
3505 if (needs_sandboxing) {
3506 uint64_t bset;
3507
3508 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3509 * requested. (Note this is placed after the general resource limit initialization, see
3510 * above, in order to take precedence.) */
3511 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3512 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3513 *exit_status = EXIT_LIMITS;
3514 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3515 }
3516 }
3517
3518 #if ENABLE_SMACK
3519 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3520 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3521 if (use_smack) {
3522 r = setup_smack(context, command);
3523 if (r < 0) {
3524 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3525 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3526 }
3527 }
3528 #endif
3529
3530 bset = context->capability_bounding_set;
3531 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3532 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3533 * instead of us doing that */
3534 if (needs_ambient_hack)
3535 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3536 (UINT64_C(1) << CAP_SETUID) |
3537 (UINT64_C(1) << CAP_SETGID);
3538
3539 if (!cap_test_all(bset)) {
3540 r = capability_bounding_set_drop(bset, false);
3541 if (r < 0) {
3542 *exit_status = EXIT_CAPABILITIES;
3543 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3544 }
3545 }
3546
3547 /* This is done before enforce_user, but ambient set
3548 * does not survive over setresuid() if keep_caps is not set. */
3549 if (!needs_ambient_hack &&
3550 context->capability_ambient_set != 0) {
3551 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3552 if (r < 0) {
3553 *exit_status = EXIT_CAPABILITIES;
3554 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3555 }
3556 }
3557 }
3558
3559 /* chroot to root directory first, before we lose the ability to chroot */
3560 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
3561 if (r < 0)
3562 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
3563
3564 if (needs_setuid) {
3565 if (uid_is_valid(uid)) {
3566 r = enforce_user(context, uid);
3567 if (r < 0) {
3568 *exit_status = EXIT_USER;
3569 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3570 }
3571
3572 if (!needs_ambient_hack &&
3573 context->capability_ambient_set != 0) {
3574
3575 /* Fix the ambient capabilities after user change. */
3576 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3577 if (r < 0) {
3578 *exit_status = EXIT_CAPABILITIES;
3579 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3580 }
3581
3582 /* If we were asked to change user and ambient capabilities
3583 * were requested, we had to add keep-caps to the securebits
3584 * so that we would maintain the inherited capability set
3585 * through the setresuid(). Make sure that the bit is added
3586 * also to the context secure_bits so that we don't try to
3587 * drop the bit away next. */
3588
3589 secure_bits |= 1<<SECURE_KEEP_CAPS;
3590 }
3591 }
3592 }
3593
3594 /* Apply working directory here, because the working directory might be on NFS and only the user running
3595 * this service might have the correct privilege to change to the working directory */
3596 r = apply_working_directory(context, params, home, exit_status);
3597 if (r < 0)
3598 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3599
3600 if (needs_sandboxing) {
3601 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3602 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3603 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3604 * are restricted. */
3605
3606 #if HAVE_SELINUX
3607 if (use_selinux) {
3608 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3609
3610 if (exec_context) {
3611 r = setexeccon(exec_context);
3612 if (r < 0) {
3613 *exit_status = EXIT_SELINUX_CONTEXT;
3614 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3615 }
3616 }
3617 }
3618 #endif
3619
3620 #if HAVE_APPARMOR
3621 if (use_apparmor && context->apparmor_profile) {
3622 r = aa_change_onexec(context->apparmor_profile);
3623 if (r < 0 && !context->apparmor_profile_ignore) {
3624 *exit_status = EXIT_APPARMOR_PROFILE;
3625 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3626 }
3627 }
3628 #endif
3629
3630 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3631 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3632 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3633 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3634 *exit_status = EXIT_SECUREBITS;
3635 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3636 }
3637
3638 if (context_has_no_new_privileges(context))
3639 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3640 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3641 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3642 }
3643
3644 #if HAVE_SECCOMP
3645 r = apply_address_families(unit, context);
3646 if (r < 0) {
3647 *exit_status = EXIT_ADDRESS_FAMILIES;
3648 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3649 }
3650
3651 r = apply_memory_deny_write_execute(unit, context);
3652 if (r < 0) {
3653 *exit_status = EXIT_SECCOMP;
3654 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3655 }
3656
3657 r = apply_restrict_realtime(unit, context);
3658 if (r < 0) {
3659 *exit_status = EXIT_SECCOMP;
3660 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3661 }
3662
3663 r = apply_restrict_suid_sgid(unit, context);
3664 if (r < 0) {
3665 *exit_status = EXIT_SECCOMP;
3666 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3667 }
3668
3669 r = apply_restrict_namespaces(unit, context);
3670 if (r < 0) {
3671 *exit_status = EXIT_SECCOMP;
3672 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3673 }
3674
3675 r = apply_protect_sysctl(unit, context);
3676 if (r < 0) {
3677 *exit_status = EXIT_SECCOMP;
3678 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3679 }
3680
3681 r = apply_protect_kernel_modules(unit, context);
3682 if (r < 0) {
3683 *exit_status = EXIT_SECCOMP;
3684 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3685 }
3686
3687 r = apply_private_devices(unit, context);
3688 if (r < 0) {
3689 *exit_status = EXIT_SECCOMP;
3690 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3691 }
3692
3693 r = apply_syscall_archs(unit, context);
3694 if (r < 0) {
3695 *exit_status = EXIT_SECCOMP;
3696 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3697 }
3698
3699 r = apply_lock_personality(unit, context);
3700 if (r < 0) {
3701 *exit_status = EXIT_SECCOMP;
3702 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3703 }
3704
3705 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3706 * by the filter as little as possible. */
3707 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3708 if (r < 0) {
3709 *exit_status = EXIT_SECCOMP;
3710 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3711 }
3712 #endif
3713 }
3714
3715 if (!strv_isempty(context->unset_environment)) {
3716 char **ee = NULL;
3717
3718 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3719 if (!ee) {
3720 *exit_status = EXIT_MEMORY;
3721 return log_oom();
3722 }
3723
3724 strv_free_and_replace(accum_env, ee);
3725 }
3726
3727 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3728 replaced_argv = replace_env_argv(command->argv, accum_env);
3729 if (!replaced_argv) {
3730 *exit_status = EXIT_MEMORY;
3731 return log_oom();
3732 }
3733 final_argv = replaced_argv;
3734 } else
3735 final_argv = command->argv;
3736
3737 if (DEBUG_LOGGING) {
3738 _cleanup_free_ char *line;
3739
3740 line = exec_command_line(final_argv);
3741 if (line)
3742 log_struct(LOG_DEBUG,
3743 "EXECUTABLE=%s", command->path,
3744 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3745 LOG_UNIT_ID(unit),
3746 LOG_UNIT_INVOCATION_ID(unit));
3747 }
3748
3749 if (exec_fd >= 0) {
3750 uint8_t hot = 1;
3751
3752 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3753 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3754
3755 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3756 *exit_status = EXIT_EXEC;
3757 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3758 }
3759 }
3760
3761 execve(command->path, final_argv, accum_env);
3762 r = -errno;
3763
3764 if (exec_fd >= 0) {
3765 uint8_t hot = 0;
3766
3767 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3768 * that POLLHUP on it no longer means execve() succeeded. */
3769
3770 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3771 *exit_status = EXIT_EXEC;
3772 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
3773 }
3774 }
3775
3776 if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3777 log_struct_errno(LOG_INFO, r,
3778 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3779 LOG_UNIT_ID(unit),
3780 LOG_UNIT_INVOCATION_ID(unit),
3781 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3782 command->path),
3783 "EXECUTABLE=%s", command->path);
3784 return 0;
3785 }
3786
3787 *exit_status = EXIT_EXEC;
3788 return log_unit_error_errno(unit, r, "Failed to execute command: %m");
3789 }
3790
3791 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
3792 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
3793
3794 int exec_spawn(Unit *unit,
3795 ExecCommand *command,
3796 const ExecContext *context,
3797 const ExecParameters *params,
3798 ExecRuntime *runtime,
3799 DynamicCreds *dcreds,
3800 pid_t *ret) {
3801
3802 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
3803 _cleanup_free_ char *subcgroup_path = NULL;
3804 _cleanup_strv_free_ char **files_env = NULL;
3805 size_t n_storage_fds = 0, n_socket_fds = 0;
3806 _cleanup_free_ char *line = NULL;
3807 pid_t pid;
3808
3809 assert(unit);
3810 assert(command);
3811 assert(context);
3812 assert(ret);
3813 assert(params);
3814 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
3815
3816 if (context->std_input == EXEC_INPUT_SOCKET ||
3817 context->std_output == EXEC_OUTPUT_SOCKET ||
3818 context->std_error == EXEC_OUTPUT_SOCKET) {
3819
3820 if (params->n_socket_fds > 1) {
3821 log_unit_error(unit, "Got more than one socket.");
3822 return -EINVAL;
3823 }
3824
3825 if (params->n_socket_fds == 0) {
3826 log_unit_error(unit, "Got no socket.");
3827 return -EINVAL;
3828 }
3829
3830 socket_fd = params->fds[0];
3831 } else {
3832 socket_fd = -1;
3833 fds = params->fds;
3834 n_socket_fds = params->n_socket_fds;
3835 n_storage_fds = params->n_storage_fds;
3836 }
3837
3838 r = exec_context_named_iofds(context, params, named_iofds);
3839 if (r < 0)
3840 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3841
3842 r = exec_context_load_environment(unit, context, &files_env);
3843 if (r < 0)
3844 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3845
3846 line = exec_command_line(command->argv);
3847 if (!line)
3848 return log_oom();
3849
3850 log_struct(LOG_DEBUG,
3851 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3852 "EXECUTABLE=%s", command->path,
3853 LOG_UNIT_ID(unit),
3854 LOG_UNIT_INVOCATION_ID(unit));
3855
3856 if (params->cgroup_path) {
3857 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
3858 if (r < 0)
3859 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
3860 if (r > 0) { /* We are using a child cgroup */
3861 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
3862 if (r < 0)
3863 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
3864 }
3865 }
3866
3867 pid = fork();
3868 if (pid < 0)
3869 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3870
3871 if (pid == 0) {
3872 int exit_status = EXIT_SUCCESS;
3873
3874 r = exec_child(unit,
3875 command,
3876 context,
3877 params,
3878 runtime,
3879 dcreds,
3880 socket_fd,
3881 named_iofds,
3882 fds,
3883 n_socket_fds,
3884 n_storage_fds,
3885 files_env,
3886 unit->manager->user_lookup_fds[1],
3887 &exit_status);
3888
3889 if (r < 0) {
3890 const char *status =
3891 exit_status_to_string(exit_status,
3892 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
3893
3894 log_struct_errno(LOG_ERR, r,
3895 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3896 LOG_UNIT_ID(unit),
3897 LOG_UNIT_INVOCATION_ID(unit),
3898 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3899 status, command->path),
3900 "EXECUTABLE=%s", command->path);
3901 }
3902
3903 _exit(exit_status);
3904 }
3905
3906 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3907
3908 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
3909 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
3910 * process will be killed too). */
3911 if (subcgroup_path)
3912 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
3913
3914 exec_status_start(&command->exec_status, pid);
3915
3916 *ret = pid;
3917 return 0;
3918 }
3919
3920 void exec_context_init(ExecContext *c) {
3921 ExecDirectoryType i;
3922
3923 assert(c);
3924
3925 c->umask = 0022;
3926 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3927 c->cpu_sched_policy = SCHED_OTHER;
3928 c->syslog_priority = LOG_DAEMON|LOG_INFO;
3929 c->syslog_level_prefix = true;
3930 c->ignore_sigpipe = true;
3931 c->timer_slack_nsec = NSEC_INFINITY;
3932 c->personality = PERSONALITY_INVALID;
3933 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3934 c->directories[i].mode = 0755;
3935 c->capability_bounding_set = CAP_ALL;
3936 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
3937 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
3938 c->log_level_max = -1;
3939 numa_policy_reset(&c->numa_policy);
3940 }
3941
3942 void exec_context_done(ExecContext *c) {
3943 ExecDirectoryType i;
3944 size_t l;
3945
3946 assert(c);
3947
3948 c->environment = strv_free(c->environment);
3949 c->environment_files = strv_free(c->environment_files);
3950 c->pass_environment = strv_free(c->pass_environment);
3951 c->unset_environment = strv_free(c->unset_environment);
3952
3953 rlimit_free_all(c->rlimit);
3954
3955 for (l = 0; l < 3; l++) {
3956 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3957 c->stdio_file[l] = mfree(c->stdio_file[l]);
3958 }
3959
3960 c->working_directory = mfree(c->working_directory);
3961 c->root_directory = mfree(c->root_directory);
3962 c->root_image = mfree(c->root_image);
3963 c->tty_path = mfree(c->tty_path);
3964 c->syslog_identifier = mfree(c->syslog_identifier);
3965 c->user = mfree(c->user);
3966 c->group = mfree(c->group);
3967
3968 c->supplementary_groups = strv_free(c->supplementary_groups);
3969
3970 c->pam_name = mfree(c->pam_name);
3971
3972 c->read_only_paths = strv_free(c->read_only_paths);
3973 c->read_write_paths = strv_free(c->read_write_paths);
3974 c->inaccessible_paths = strv_free(c->inaccessible_paths);
3975
3976 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3977 c->bind_mounts = NULL;
3978 c->n_bind_mounts = 0;
3979 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3980 c->temporary_filesystems = NULL;
3981 c->n_temporary_filesystems = 0;
3982
3983 cpu_set_reset(&c->cpu_set);
3984 numa_policy_reset(&c->numa_policy);
3985
3986 c->utmp_id = mfree(c->utmp_id);
3987 c->selinux_context = mfree(c->selinux_context);
3988 c->apparmor_profile = mfree(c->apparmor_profile);
3989 c->smack_process_label = mfree(c->smack_process_label);
3990
3991 c->syscall_filter = hashmap_free(c->syscall_filter);
3992 c->syscall_archs = set_free(c->syscall_archs);
3993 c->address_families = set_free(c->address_families);
3994
3995 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3996 c->directories[i].paths = strv_free(c->directories[i].paths);
3997
3998 c->log_level_max = -1;
3999
4000 exec_context_free_log_extra_fields(c);
4001
4002 c->log_rate_limit_interval_usec = 0;
4003 c->log_rate_limit_burst = 0;
4004
4005 c->stdin_data = mfree(c->stdin_data);
4006 c->stdin_data_size = 0;
4007
4008 c->network_namespace_path = mfree(c->network_namespace_path);
4009 }
4010
4011 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4012 char **i;
4013
4014 assert(c);
4015
4016 if (!runtime_prefix)
4017 return 0;
4018
4019 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4020 _cleanup_free_ char *p;
4021
4022 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4023 p = path_join(runtime_prefix, "private", *i);
4024 else
4025 p = path_join(runtime_prefix, *i);
4026 if (!p)
4027 return -ENOMEM;
4028
4029 /* We execute this synchronously, since we need to be sure this is gone when we start the
4030 * service next. */
4031 (void) rm_rf(p, REMOVE_ROOT);
4032 }
4033
4034 return 0;
4035 }
4036
4037 static void exec_command_done(ExecCommand *c) {
4038 assert(c);
4039
4040 c->path = mfree(c->path);
4041 c->argv = strv_free(c->argv);
4042 }
4043
4044 void exec_command_done_array(ExecCommand *c, size_t n) {
4045 size_t i;
4046
4047 for (i = 0; i < n; i++)
4048 exec_command_done(c+i);
4049 }
4050
4051 ExecCommand* exec_command_free_list(ExecCommand *c) {
4052 ExecCommand *i;
4053
4054 while ((i = c)) {
4055 LIST_REMOVE(command, c, i);
4056 exec_command_done(i);
4057 free(i);
4058 }
4059
4060 return NULL;
4061 }
4062
4063 void exec_command_free_array(ExecCommand **c, size_t n) {
4064 size_t i;
4065
4066 for (i = 0; i < n; i++)
4067 c[i] = exec_command_free_list(c[i]);
4068 }
4069
4070 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4071 size_t i;
4072
4073 for (i = 0; i < n; i++)
4074 exec_status_reset(&c[i].exec_status);
4075 }
4076
4077 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4078 size_t i;
4079
4080 for (i = 0; i < n; i++) {
4081 ExecCommand *z;
4082
4083 LIST_FOREACH(command, z, c[i])
4084 exec_status_reset(&z->exec_status);
4085 }
4086 }
4087
4088 typedef struct InvalidEnvInfo {
4089 const Unit *unit;
4090 const char *path;
4091 } InvalidEnvInfo;
4092
4093 static void invalid_env(const char *p, void *userdata) {
4094 InvalidEnvInfo *info = userdata;
4095
4096 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4097 }
4098
4099 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4100 assert(c);
4101
4102 switch (fd_index) {
4103
4104 case STDIN_FILENO:
4105 if (c->std_input != EXEC_INPUT_NAMED_FD)
4106 return NULL;
4107
4108 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4109
4110 case STDOUT_FILENO:
4111 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4112 return NULL;
4113
4114 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4115
4116 case STDERR_FILENO:
4117 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4118 return NULL;
4119
4120 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4121
4122 default:
4123 return NULL;
4124 }
4125 }
4126
4127 static int exec_context_named_iofds(
4128 const ExecContext *c,
4129 const ExecParameters *p,
4130 int named_iofds[static 3]) {
4131
4132 size_t i, targets;
4133 const char* stdio_fdname[3];
4134 size_t n_fds;
4135
4136 assert(c);
4137 assert(p);
4138 assert(named_iofds);
4139
4140 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4141 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4142 (c->std_error == EXEC_OUTPUT_NAMED_FD);
4143
4144 for (i = 0; i < 3; i++)
4145 stdio_fdname[i] = exec_context_fdname(c, i);
4146
4147 n_fds = p->n_storage_fds + p->n_socket_fds;
4148
4149 for (i = 0; i < n_fds && targets > 0; i++)
4150 if (named_iofds[STDIN_FILENO] < 0 &&
4151 c->std_input == EXEC_INPUT_NAMED_FD &&
4152 stdio_fdname[STDIN_FILENO] &&
4153 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4154
4155 named_iofds[STDIN_FILENO] = p->fds[i];
4156 targets--;
4157
4158 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4159 c->std_output == EXEC_OUTPUT_NAMED_FD &&
4160 stdio_fdname[STDOUT_FILENO] &&
4161 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4162
4163 named_iofds[STDOUT_FILENO] = p->fds[i];
4164 targets--;
4165
4166 } else if (named_iofds[STDERR_FILENO] < 0 &&
4167 c->std_error == EXEC_OUTPUT_NAMED_FD &&
4168 stdio_fdname[STDERR_FILENO] &&
4169 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4170
4171 named_iofds[STDERR_FILENO] = p->fds[i];
4172 targets--;
4173 }
4174
4175 return targets == 0 ? 0 : -ENOENT;
4176 }
4177
4178 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
4179 char **i, **r = NULL;
4180
4181 assert(c);
4182 assert(l);
4183
4184 STRV_FOREACH(i, c->environment_files) {
4185 char *fn;
4186 int k;
4187 unsigned n;
4188 bool ignore = false;
4189 char **p;
4190 _cleanup_globfree_ glob_t pglob = {};
4191
4192 fn = *i;
4193
4194 if (fn[0] == '-') {
4195 ignore = true;
4196 fn++;
4197 }
4198
4199 if (!path_is_absolute(fn)) {
4200 if (ignore)
4201 continue;
4202
4203 strv_free(r);
4204 return -EINVAL;
4205 }
4206
4207 /* Filename supports globbing, take all matching files */
4208 k = safe_glob(fn, 0, &pglob);
4209 if (k < 0) {
4210 if (ignore)
4211 continue;
4212
4213 strv_free(r);
4214 return k;
4215 }
4216
4217 /* When we don't match anything, -ENOENT should be returned */
4218 assert(pglob.gl_pathc > 0);
4219
4220 for (n = 0; n < pglob.gl_pathc; n++) {
4221 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4222 if (k < 0) {
4223 if (ignore)
4224 continue;
4225
4226 strv_free(r);
4227 return k;
4228 }
4229 /* Log invalid environment variables with filename */
4230 if (p) {
4231 InvalidEnvInfo info = {
4232 .unit = unit,
4233 .path = pglob.gl_pathv[n]
4234 };
4235
4236 p = strv_env_clean_with_callback(p, invalid_env, &info);
4237 }
4238
4239 if (!r)
4240 r = p;
4241 else {
4242 char **m;
4243
4244 m = strv_env_merge(2, r, p);
4245 strv_free(r);
4246 strv_free(p);
4247 if (!m)
4248 return -ENOMEM;
4249
4250 r = m;
4251 }
4252 }
4253 }
4254
4255 *l = r;
4256
4257 return 0;
4258 }
4259
4260 static bool tty_may_match_dev_console(const char *tty) {
4261 _cleanup_free_ char *resolved = NULL;
4262
4263 if (!tty)
4264 return true;
4265
4266 tty = skip_dev_prefix(tty);
4267
4268 /* trivial identity? */
4269 if (streq(tty, "console"))
4270 return true;
4271
4272 if (resolve_dev_console(&resolved) < 0)
4273 return true; /* if we could not resolve, assume it may */
4274
4275 /* "tty0" means the active VC, so it may be the same sometimes */
4276 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4277 }
4278
4279 static bool exec_context_may_touch_tty(const ExecContext *ec) {
4280 assert(ec);
4281
4282 return ec->tty_reset ||
4283 ec->tty_vhangup ||
4284 ec->tty_vt_disallocate ||
4285 is_terminal_input(ec->std_input) ||
4286 is_terminal_output(ec->std_output) ||
4287 is_terminal_output(ec->std_error);
4288 }
4289
4290 bool exec_context_may_touch_console(const ExecContext *ec) {
4291
4292 return exec_context_may_touch_tty(ec) &&
4293 tty_may_match_dev_console(exec_context_tty_path(ec));
4294 }
4295
4296 static void strv_fprintf(FILE *f, char **l) {
4297 char **g;
4298
4299 assert(f);
4300
4301 STRV_FOREACH(g, l)
4302 fprintf(f, " %s", *g);
4303 }
4304
4305 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4306 ExecDirectoryType dt;
4307 char **e, **d;
4308 unsigned i;
4309 int r;
4310
4311 assert(c);
4312 assert(f);
4313
4314 prefix = strempty(prefix);
4315
4316 fprintf(f,
4317 "%sUMask: %04o\n"
4318 "%sWorkingDirectory: %s\n"
4319 "%sRootDirectory: %s\n"
4320 "%sNonBlocking: %s\n"
4321 "%sPrivateTmp: %s\n"
4322 "%sPrivateDevices: %s\n"
4323 "%sProtectKernelTunables: %s\n"
4324 "%sProtectKernelModules: %s\n"
4325 "%sProtectControlGroups: %s\n"
4326 "%sPrivateNetwork: %s\n"
4327 "%sPrivateUsers: %s\n"
4328 "%sProtectHome: %s\n"
4329 "%sProtectSystem: %s\n"
4330 "%sMountAPIVFS: %s\n"
4331 "%sIgnoreSIGPIPE: %s\n"
4332 "%sMemoryDenyWriteExecute: %s\n"
4333 "%sRestrictRealtime: %s\n"
4334 "%sRestrictSUIDSGID: %s\n"
4335 "%sKeyringMode: %s\n"
4336 "%sProtectHostname: %s\n",
4337 prefix, c->umask,
4338 prefix, c->working_directory ? c->working_directory : "/",
4339 prefix, c->root_directory ? c->root_directory : "/",
4340 prefix, yes_no(c->non_blocking),
4341 prefix, yes_no(c->private_tmp),
4342 prefix, yes_no(c->private_devices),
4343 prefix, yes_no(c->protect_kernel_tunables),
4344 prefix, yes_no(c->protect_kernel_modules),
4345 prefix, yes_no(c->protect_control_groups),
4346 prefix, yes_no(c->private_network),
4347 prefix, yes_no(c->private_users),
4348 prefix, protect_home_to_string(c->protect_home),
4349 prefix, protect_system_to_string(c->protect_system),
4350 prefix, yes_no(c->mount_apivfs),
4351 prefix, yes_no(c->ignore_sigpipe),
4352 prefix, yes_no(c->memory_deny_write_execute),
4353 prefix, yes_no(c->restrict_realtime),
4354 prefix, yes_no(c->restrict_suid_sgid),
4355 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4356 prefix, yes_no(c->protect_hostname));
4357
4358 if (c->root_image)
4359 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4360
4361 STRV_FOREACH(e, c->environment)
4362 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4363
4364 STRV_FOREACH(e, c->environment_files)
4365 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4366
4367 STRV_FOREACH(e, c->pass_environment)
4368 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4369
4370 STRV_FOREACH(e, c->unset_environment)
4371 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4372
4373 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4374
4375 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4376 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4377
4378 STRV_FOREACH(d, c->directories[dt].paths)
4379 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4380 }
4381
4382 if (c->nice_set)
4383 fprintf(f,
4384 "%sNice: %i\n",
4385 prefix, c->nice);
4386
4387 if (c->oom_score_adjust_set)
4388 fprintf(f,
4389 "%sOOMScoreAdjust: %i\n",
4390 prefix, c->oom_score_adjust);
4391
4392 for (i = 0; i < RLIM_NLIMITS; i++)
4393 if (c->rlimit[i]) {
4394 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4395 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4396 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4397 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4398 }
4399
4400 if (c->ioprio_set) {
4401 _cleanup_free_ char *class_str = NULL;
4402
4403 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4404 if (r >= 0)
4405 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4406
4407 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4408 }
4409
4410 if (c->cpu_sched_set) {
4411 _cleanup_free_ char *policy_str = NULL;
4412
4413 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4414 if (r >= 0)
4415 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4416
4417 fprintf(f,
4418 "%sCPUSchedulingPriority: %i\n"
4419 "%sCPUSchedulingResetOnFork: %s\n",
4420 prefix, c->cpu_sched_priority,
4421 prefix, yes_no(c->cpu_sched_reset_on_fork));
4422 }
4423
4424 if (c->cpu_set.set) {
4425 _cleanup_free_ char *affinity = NULL;
4426
4427 affinity = cpu_set_to_range_string(&c->cpu_set);
4428 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
4429 }
4430
4431 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
4432 _cleanup_free_ char *nodes = NULL;
4433
4434 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
4435 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
4436 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
4437 }
4438
4439 if (c->timer_slack_nsec != NSEC_INFINITY)
4440 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4441
4442 fprintf(f,
4443 "%sStandardInput: %s\n"
4444 "%sStandardOutput: %s\n"
4445 "%sStandardError: %s\n",
4446 prefix, exec_input_to_string(c->std_input),
4447 prefix, exec_output_to_string(c->std_output),
4448 prefix, exec_output_to_string(c->std_error));
4449
4450 if (c->std_input == EXEC_INPUT_NAMED_FD)
4451 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4452 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4453 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4454 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4455 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4456
4457 if (c->std_input == EXEC_INPUT_FILE)
4458 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4459 if (c->std_output == EXEC_OUTPUT_FILE)
4460 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4461 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4462 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4463 if (c->std_error == EXEC_OUTPUT_FILE)
4464 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4465 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4466 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4467
4468 if (c->tty_path)
4469 fprintf(f,
4470 "%sTTYPath: %s\n"
4471 "%sTTYReset: %s\n"
4472 "%sTTYVHangup: %s\n"
4473 "%sTTYVTDisallocate: %s\n",
4474 prefix, c->tty_path,
4475 prefix, yes_no(c->tty_reset),
4476 prefix, yes_no(c->tty_vhangup),
4477 prefix, yes_no(c->tty_vt_disallocate));
4478
4479 if (IN_SET(c->std_output,
4480 EXEC_OUTPUT_SYSLOG,
4481 EXEC_OUTPUT_KMSG,
4482 EXEC_OUTPUT_JOURNAL,
4483 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4484 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4485 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4486 IN_SET(c->std_error,
4487 EXEC_OUTPUT_SYSLOG,
4488 EXEC_OUTPUT_KMSG,
4489 EXEC_OUTPUT_JOURNAL,
4490 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4491 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4492 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4493
4494 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4495
4496 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4497 if (r >= 0)
4498 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4499
4500 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4501 if (r >= 0)
4502 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4503 }
4504
4505 if (c->log_level_max >= 0) {
4506 _cleanup_free_ char *t = NULL;
4507
4508 (void) log_level_to_string_alloc(c->log_level_max, &t);
4509
4510 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4511 }
4512
4513 if (c->log_rate_limit_interval_usec > 0) {
4514 char buf_timespan[FORMAT_TIMESPAN_MAX];
4515
4516 fprintf(f,
4517 "%sLogRateLimitIntervalSec: %s\n",
4518 prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_rate_limit_interval_usec, USEC_PER_SEC));
4519 }
4520
4521 if (c->log_rate_limit_burst > 0)
4522 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_rate_limit_burst);
4523
4524 if (c->n_log_extra_fields > 0) {
4525 size_t j;
4526
4527 for (j = 0; j < c->n_log_extra_fields; j++) {
4528 fprintf(f, "%sLogExtraFields: ", prefix);
4529 fwrite(c->log_extra_fields[j].iov_base,
4530 1, c->log_extra_fields[j].iov_len,
4531 f);
4532 fputc('\n', f);
4533 }
4534 }
4535
4536 if (c->secure_bits) {
4537 _cleanup_free_ char *str = NULL;
4538
4539 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4540 if (r >= 0)
4541 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4542 }
4543
4544 if (c->capability_bounding_set != CAP_ALL) {
4545 _cleanup_free_ char *str = NULL;
4546
4547 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4548 if (r >= 0)
4549 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4550 }
4551
4552 if (c->capability_ambient_set != 0) {
4553 _cleanup_free_ char *str = NULL;
4554
4555 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4556 if (r >= 0)
4557 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4558 }
4559
4560 if (c->user)
4561 fprintf(f, "%sUser: %s\n", prefix, c->user);
4562 if (c->group)
4563 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4564
4565 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4566
4567 if (!strv_isempty(c->supplementary_groups)) {
4568 fprintf(f, "%sSupplementaryGroups:", prefix);
4569 strv_fprintf(f, c->supplementary_groups);
4570 fputs("\n", f);
4571 }
4572
4573 if (c->pam_name)
4574 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4575
4576 if (!strv_isempty(c->read_write_paths)) {
4577 fprintf(f, "%sReadWritePaths:", prefix);
4578 strv_fprintf(f, c->read_write_paths);
4579 fputs("\n", f);
4580 }
4581
4582 if (!strv_isempty(c->read_only_paths)) {
4583 fprintf(f, "%sReadOnlyPaths:", prefix);
4584 strv_fprintf(f, c->read_only_paths);
4585 fputs("\n", f);
4586 }
4587
4588 if (!strv_isempty(c->inaccessible_paths)) {
4589 fprintf(f, "%sInaccessiblePaths:", prefix);
4590 strv_fprintf(f, c->inaccessible_paths);
4591 fputs("\n", f);
4592 }
4593
4594 if (c->n_bind_mounts > 0)
4595 for (i = 0; i < c->n_bind_mounts; i++)
4596 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4597 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4598 c->bind_mounts[i].ignore_enoent ? "-": "",
4599 c->bind_mounts[i].source,
4600 c->bind_mounts[i].destination,
4601 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4602
4603 if (c->n_temporary_filesystems > 0)
4604 for (i = 0; i < c->n_temporary_filesystems; i++) {
4605 TemporaryFileSystem *t = c->temporary_filesystems + i;
4606
4607 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4608 t->path,
4609 isempty(t->options) ? "" : ":",
4610 strempty(t->options));
4611 }
4612
4613 if (c->utmp_id)
4614 fprintf(f,
4615 "%sUtmpIdentifier: %s\n",
4616 prefix, c->utmp_id);
4617
4618 if (c->selinux_context)
4619 fprintf(f,
4620 "%sSELinuxContext: %s%s\n",
4621 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4622
4623 if (c->apparmor_profile)
4624 fprintf(f,
4625 "%sAppArmorProfile: %s%s\n",
4626 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4627
4628 if (c->smack_process_label)
4629 fprintf(f,
4630 "%sSmackProcessLabel: %s%s\n",
4631 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4632
4633 if (c->personality != PERSONALITY_INVALID)
4634 fprintf(f,
4635 "%sPersonality: %s\n",
4636 prefix, strna(personality_to_string(c->personality)));
4637
4638 fprintf(f,
4639 "%sLockPersonality: %s\n",
4640 prefix, yes_no(c->lock_personality));
4641
4642 if (c->syscall_filter) {
4643 #if HAVE_SECCOMP
4644 Iterator j;
4645 void *id, *val;
4646 bool first = true;
4647 #endif
4648
4649 fprintf(f,
4650 "%sSystemCallFilter: ",
4651 prefix);
4652
4653 if (!c->syscall_whitelist)
4654 fputc('~', f);
4655
4656 #if HAVE_SECCOMP
4657 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4658 _cleanup_free_ char *name = NULL;
4659 const char *errno_name = NULL;
4660 int num = PTR_TO_INT(val);
4661
4662 if (first)
4663 first = false;
4664 else
4665 fputc(' ', f);
4666
4667 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4668 fputs(strna(name), f);
4669
4670 if (num >= 0) {
4671 errno_name = errno_to_name(num);
4672 if (errno_name)
4673 fprintf(f, ":%s", errno_name);
4674 else
4675 fprintf(f, ":%d", num);
4676 }
4677 }
4678 #endif
4679
4680 fputc('\n', f);
4681 }
4682
4683 if (c->syscall_archs) {
4684 #if HAVE_SECCOMP
4685 Iterator j;
4686 void *id;
4687 #endif
4688
4689 fprintf(f,
4690 "%sSystemCallArchitectures:",
4691 prefix);
4692
4693 #if HAVE_SECCOMP
4694 SET_FOREACH(id, c->syscall_archs, j)
4695 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4696 #endif
4697 fputc('\n', f);
4698 }
4699
4700 if (exec_context_restrict_namespaces_set(c)) {
4701 _cleanup_free_ char *s = NULL;
4702
4703 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4704 if (r >= 0)
4705 fprintf(f, "%sRestrictNamespaces: %s\n",
4706 prefix, s);
4707 }
4708
4709 if (c->network_namespace_path)
4710 fprintf(f,
4711 "%sNetworkNamespacePath: %s\n",
4712 prefix, c->network_namespace_path);
4713
4714 if (c->syscall_errno > 0) {
4715 const char *errno_name;
4716
4717 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4718
4719 errno_name = errno_to_name(c->syscall_errno);
4720 if (errno_name)
4721 fprintf(f, "%s\n", errno_name);
4722 else
4723 fprintf(f, "%d\n", c->syscall_errno);
4724 }
4725 }
4726
4727 bool exec_context_maintains_privileges(const ExecContext *c) {
4728 assert(c);
4729
4730 /* Returns true if the process forked off would run under
4731 * an unchanged UID or as root. */
4732
4733 if (!c->user)
4734 return true;
4735
4736 if (streq(c->user, "root") || streq(c->user, "0"))
4737 return true;
4738
4739 return false;
4740 }
4741
4742 int exec_context_get_effective_ioprio(const ExecContext *c) {
4743 int p;
4744
4745 assert(c);
4746
4747 if (c->ioprio_set)
4748 return c->ioprio;
4749
4750 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4751 if (p < 0)
4752 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4753
4754 return p;
4755 }
4756
4757 void exec_context_free_log_extra_fields(ExecContext *c) {
4758 size_t l;
4759
4760 assert(c);
4761
4762 for (l = 0; l < c->n_log_extra_fields; l++)
4763 free(c->log_extra_fields[l].iov_base);
4764 c->log_extra_fields = mfree(c->log_extra_fields);
4765 c->n_log_extra_fields = 0;
4766 }
4767
4768 void exec_context_revert_tty(ExecContext *c) {
4769 int r;
4770
4771 assert(c);
4772
4773 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
4774 exec_context_tty_reset(c, NULL);
4775
4776 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
4777 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
4778 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
4779
4780 if (exec_context_may_touch_tty(c)) {
4781 const char *path;
4782
4783 path = exec_context_tty_path(c);
4784 if (path) {
4785 r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
4786 if (r < 0 && r != -ENOENT)
4787 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
4788 }
4789 }
4790 }
4791
4792 int exec_context_get_clean_directories(
4793 ExecContext *c,
4794 char **prefix,
4795 ExecCleanMask mask,
4796 char ***ret) {
4797
4798 _cleanup_strv_free_ char **l = NULL;
4799 ExecDirectoryType t;
4800 int r;
4801
4802 assert(c);
4803 assert(prefix);
4804 assert(ret);
4805
4806 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4807 char **i;
4808
4809 if (!FLAGS_SET(mask, 1U << t))
4810 continue;
4811
4812 if (!prefix[t])
4813 continue;
4814
4815 STRV_FOREACH(i, c->directories[t].paths) {
4816 char *j;
4817
4818 j = path_join(prefix[t], *i);
4819 if (!j)
4820 return -ENOMEM;
4821
4822 r = strv_consume(&l, j);
4823 if (r < 0)
4824 return r;
4825
4826 /* Also remove private directories unconditionally. */
4827 if (t != EXEC_DIRECTORY_CONFIGURATION) {
4828 j = path_join(prefix[t], "private", *i);
4829 if (!j)
4830 return -ENOMEM;
4831
4832 r = strv_consume(&l, j);
4833 if (r < 0)
4834 return r;
4835 }
4836 }
4837 }
4838
4839 *ret = TAKE_PTR(l);
4840 return 0;
4841 }
4842
4843 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
4844 ExecCleanMask mask = 0;
4845
4846 assert(c);
4847 assert(ret);
4848
4849 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4850 if (!strv_isempty(c->directories[t].paths))
4851 mask |= 1U << t;
4852
4853 *ret = mask;
4854 return 0;
4855 }
4856
4857 void exec_status_start(ExecStatus *s, pid_t pid) {
4858 assert(s);
4859
4860 *s = (ExecStatus) {
4861 .pid = pid,
4862 };
4863
4864 dual_timestamp_get(&s->start_timestamp);
4865 }
4866
4867 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4868 assert(s);
4869
4870 if (s->pid != pid) {
4871 *s = (ExecStatus) {
4872 .pid = pid,
4873 };
4874 }
4875
4876 dual_timestamp_get(&s->exit_timestamp);
4877
4878 s->code = code;
4879 s->status = status;
4880
4881 if (context && context->utmp_id)
4882 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
4883 }
4884
4885 void exec_status_reset(ExecStatus *s) {
4886 assert(s);
4887
4888 *s = (ExecStatus) {};
4889 }
4890
4891 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4892 char buf[FORMAT_TIMESTAMP_MAX];
4893
4894 assert(s);
4895 assert(f);
4896
4897 if (s->pid <= 0)
4898 return;
4899
4900 prefix = strempty(prefix);
4901
4902 fprintf(f,
4903 "%sPID: "PID_FMT"\n",
4904 prefix, s->pid);
4905
4906 if (dual_timestamp_is_set(&s->start_timestamp))
4907 fprintf(f,
4908 "%sStart Timestamp: %s\n",
4909 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4910
4911 if (dual_timestamp_is_set(&s->exit_timestamp))
4912 fprintf(f,
4913 "%sExit Timestamp: %s\n"
4914 "%sExit Code: %s\n"
4915 "%sExit Status: %i\n",
4916 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4917 prefix, sigchld_code_to_string(s->code),
4918 prefix, s->status);
4919 }
4920
4921 static char *exec_command_line(char **argv) {
4922 size_t k;
4923 char *n, *p, **a;
4924 bool first = true;
4925
4926 assert(argv);
4927
4928 k = 1;
4929 STRV_FOREACH(a, argv)
4930 k += strlen(*a)+3;
4931
4932 n = new(char, k);
4933 if (!n)
4934 return NULL;
4935
4936 p = n;
4937 STRV_FOREACH(a, argv) {
4938
4939 if (!first)
4940 *(p++) = ' ';
4941 else
4942 first = false;
4943
4944 if (strpbrk(*a, WHITESPACE)) {
4945 *(p++) = '\'';
4946 p = stpcpy(p, *a);
4947 *(p++) = '\'';
4948 } else
4949 p = stpcpy(p, *a);
4950
4951 }
4952
4953 *p = 0;
4954
4955 /* FIXME: this doesn't really handle arguments that have
4956 * spaces and ticks in them */
4957
4958 return n;
4959 }
4960
4961 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4962 _cleanup_free_ char *cmd = NULL;
4963 const char *prefix2;
4964
4965 assert(c);
4966 assert(f);
4967
4968 prefix = strempty(prefix);
4969 prefix2 = strjoina(prefix, "\t");
4970
4971 cmd = exec_command_line(c->argv);
4972 fprintf(f,
4973 "%sCommand Line: %s\n",
4974 prefix, cmd ? cmd : strerror_safe(ENOMEM));
4975
4976 exec_status_dump(&c->exec_status, f, prefix2);
4977 }
4978
4979 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4980 assert(f);
4981
4982 prefix = strempty(prefix);
4983
4984 LIST_FOREACH(command, c, c)
4985 exec_command_dump(c, f, prefix);
4986 }
4987
4988 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4989 ExecCommand *end;
4990
4991 assert(l);
4992 assert(e);
4993
4994 if (*l) {
4995 /* It's kind of important, that we keep the order here */
4996 LIST_FIND_TAIL(command, *l, end);
4997 LIST_INSERT_AFTER(command, *l, end, e);
4998 } else
4999 *l = e;
5000 }
5001
5002 int exec_command_set(ExecCommand *c, const char *path, ...) {
5003 va_list ap;
5004 char **l, *p;
5005
5006 assert(c);
5007 assert(path);
5008
5009 va_start(ap, path);
5010 l = strv_new_ap(path, ap);
5011 va_end(ap);
5012
5013 if (!l)
5014 return -ENOMEM;
5015
5016 p = strdup(path);
5017 if (!p) {
5018 strv_free(l);
5019 return -ENOMEM;
5020 }
5021
5022 free_and_replace(c->path, p);
5023
5024 return strv_free_and_replace(c->argv, l);
5025 }
5026
5027 int exec_command_append(ExecCommand *c, const char *path, ...) {
5028 _cleanup_strv_free_ char **l = NULL;
5029 va_list ap;
5030 int r;
5031
5032 assert(c);
5033 assert(path);
5034
5035 va_start(ap, path);
5036 l = strv_new_ap(path, ap);
5037 va_end(ap);
5038
5039 if (!l)
5040 return -ENOMEM;
5041
5042 r = strv_extend_strv(&c->argv, l, false);
5043 if (r < 0)
5044 return r;
5045
5046 return 0;
5047 }
5048
5049 static void *remove_tmpdir_thread(void *p) {
5050 _cleanup_free_ char *path = p;
5051
5052 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5053 return NULL;
5054 }
5055
5056 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5057 int r;
5058
5059 if (!rt)
5060 return NULL;
5061
5062 if (rt->manager)
5063 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
5064
5065 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5066 if (destroy && rt->tmp_dir) {
5067 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
5068
5069 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
5070 if (r < 0) {
5071 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
5072 free(rt->tmp_dir);
5073 }
5074
5075 rt->tmp_dir = NULL;
5076 }
5077
5078 if (destroy && rt->var_tmp_dir) {
5079 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
5080
5081 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
5082 if (r < 0) {
5083 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
5084 free(rt->var_tmp_dir);
5085 }
5086
5087 rt->var_tmp_dir = NULL;
5088 }
5089
5090 rt->id = mfree(rt->id);
5091 rt->tmp_dir = mfree(rt->tmp_dir);
5092 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
5093 safe_close_pair(rt->netns_storage_socket);
5094 return mfree(rt);
5095 }
5096
5097 static void exec_runtime_freep(ExecRuntime **rt) {
5098 (void) exec_runtime_free(*rt, false);
5099 }
5100
5101 static int exec_runtime_allocate(ExecRuntime **ret) {
5102 ExecRuntime *n;
5103
5104 assert(ret);
5105
5106 n = new(ExecRuntime, 1);
5107 if (!n)
5108 return -ENOMEM;
5109
5110 *n = (ExecRuntime) {
5111 .netns_storage_socket = { -1, -1 },
5112 };
5113
5114 *ret = n;
5115 return 0;
5116 }
5117
5118 static int exec_runtime_add(
5119 Manager *m,
5120 const char *id,
5121 const char *tmp_dir,
5122 const char *var_tmp_dir,
5123 const int netns_storage_socket[2],
5124 ExecRuntime **ret) {
5125
5126 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
5127 int r;
5128
5129 assert(m);
5130 assert(id);
5131
5132 r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
5133 if (r < 0)
5134 return r;
5135
5136 r = exec_runtime_allocate(&rt);
5137 if (r < 0)
5138 return r;
5139
5140 rt->id = strdup(id);
5141 if (!rt->id)
5142 return -ENOMEM;
5143
5144 if (tmp_dir) {
5145 rt->tmp_dir = strdup(tmp_dir);
5146 if (!rt->tmp_dir)
5147 return -ENOMEM;
5148
5149 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
5150 assert(var_tmp_dir);
5151 rt->var_tmp_dir = strdup(var_tmp_dir);
5152 if (!rt->var_tmp_dir)
5153 return -ENOMEM;
5154 }
5155
5156 if (netns_storage_socket) {
5157 rt->netns_storage_socket[0] = netns_storage_socket[0];
5158 rt->netns_storage_socket[1] = netns_storage_socket[1];
5159 }
5160
5161 r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
5162 if (r < 0)
5163 return r;
5164
5165 rt->manager = m;
5166
5167 if (ret)
5168 *ret = rt;
5169
5170 /* do not remove created ExecRuntime object when the operation succeeds. */
5171 rt = NULL;
5172 return 0;
5173 }
5174
5175 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5176 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
5177 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
5178 int r;
5179
5180 assert(m);
5181 assert(c);
5182 assert(id);
5183
5184 /* It is not necessary to create ExecRuntime object. */
5185 if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
5186 return 0;
5187
5188 if (c->private_tmp) {
5189 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
5190 if (r < 0)
5191 return r;
5192 }
5193
5194 if (c->private_network || c->network_namespace_path) {
5195 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5196 return -errno;
5197 }
5198
5199 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
5200 if (r < 0)
5201 return r;
5202
5203 /* Avoid cleanup */
5204 netns_storage_socket[0] = netns_storage_socket[1] = -1;
5205 return 1;
5206 }
5207
5208 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5209 ExecRuntime *rt;
5210 int r;
5211
5212 assert(m);
5213 assert(id);
5214 assert(ret);
5215
5216 rt = hashmap_get(m->exec_runtime_by_id, id);
5217 if (rt)
5218 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5219 goto ref;
5220
5221 if (!create)
5222 return 0;
5223
5224 /* If not found, then create a new object. */
5225 r = exec_runtime_make(m, c, id, &rt);
5226 if (r <= 0)
5227 /* When r == 0, it is not necessary to create ExecRuntime object. */
5228 return r;
5229
5230 ref:
5231 /* increment reference counter. */
5232 rt->n_ref++;
5233 *ret = rt;
5234 return 1;
5235 }
5236
5237 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5238 if (!rt)
5239 return NULL;
5240
5241 assert(rt->n_ref > 0);
5242
5243 rt->n_ref--;
5244 if (rt->n_ref > 0)
5245 return NULL;
5246
5247 return exec_runtime_free(rt, destroy);
5248 }
5249
5250 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5251 ExecRuntime *rt;
5252 Iterator i;
5253
5254 assert(m);
5255 assert(f);
5256 assert(fds);
5257
5258 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5259 fprintf(f, "exec-runtime=%s", rt->id);
5260
5261 if (rt->tmp_dir)
5262 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
5263
5264 if (rt->var_tmp_dir)
5265 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
5266
5267 if (rt->netns_storage_socket[0] >= 0) {
5268 int copy;
5269
5270 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5271 if (copy < 0)
5272 return copy;
5273
5274 fprintf(f, " netns-socket-0=%i", copy);
5275 }
5276
5277 if (rt->netns_storage_socket[1] >= 0) {
5278 int copy;
5279
5280 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5281 if (copy < 0)
5282 return copy;
5283
5284 fprintf(f, " netns-socket-1=%i", copy);
5285 }
5286
5287 fputc('\n', f);
5288 }
5289
5290 return 0;
5291 }
5292
5293 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5294 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5295 ExecRuntime *rt;
5296 int r;
5297
5298 /* This is for the migration from old (v237 or earlier) deserialization text.
5299 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5300 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5301 * so or not from the serialized text, then we always creates a new object owned by this. */
5302
5303 assert(u);
5304 assert(key);
5305 assert(value);
5306
5307 /* Manager manages ExecRuntime objects by the unit id.
5308 * So, we omit the serialized text when the unit does not have id (yet?)... */
5309 if (isempty(u->id)) {
5310 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5311 return 0;
5312 }
5313
5314 r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5315 if (r < 0) {
5316 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5317 return 0;
5318 }
5319
5320 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5321 if (!rt) {
5322 r = exec_runtime_allocate(&rt_create);
5323 if (r < 0)
5324 return log_oom();
5325
5326 rt_create->id = strdup(u->id);
5327 if (!rt_create->id)
5328 return log_oom();
5329
5330 rt = rt_create;
5331 }
5332
5333 if (streq(key, "tmp-dir")) {
5334 char *copy;
5335
5336 copy = strdup(value);
5337 if (!copy)
5338 return log_oom();
5339
5340 free_and_replace(rt->tmp_dir, copy);
5341
5342 } else if (streq(key, "var-tmp-dir")) {
5343 char *copy;
5344
5345 copy = strdup(value);
5346 if (!copy)
5347 return log_oom();
5348
5349 free_and_replace(rt->var_tmp_dir, copy);
5350
5351 } else if (streq(key, "netns-socket-0")) {
5352 int fd;
5353
5354 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5355 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5356 return 0;
5357 }
5358
5359 safe_close(rt->netns_storage_socket[0]);
5360 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5361
5362 } else if (streq(key, "netns-socket-1")) {
5363 int fd;
5364
5365 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5366 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5367 return 0;
5368 }
5369
5370 safe_close(rt->netns_storage_socket[1]);
5371 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5372 } else
5373 return 0;
5374
5375 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5376 if (rt_create) {
5377 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5378 if (r < 0) {
5379 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5380 return 0;
5381 }
5382
5383 rt_create->manager = u->manager;
5384
5385 /* Avoid cleanup */
5386 rt_create = NULL;
5387 }
5388
5389 return 1;
5390 }
5391
5392 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5393 char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5394 int r, fd0 = -1, fd1 = -1;
5395 const char *p, *v = value;
5396 size_t n;
5397
5398 assert(m);
5399 assert(value);
5400 assert(fds);
5401
5402 n = strcspn(v, " ");
5403 id = strndupa(v, n);
5404 if (v[n] != ' ')
5405 goto finalize;
5406 p = v + n + 1;
5407
5408 v = startswith(p, "tmp-dir=");
5409 if (v) {
5410 n = strcspn(v, " ");
5411 tmp_dir = strndupa(v, n);
5412 if (v[n] != ' ')
5413 goto finalize;
5414 p = v + n + 1;
5415 }
5416
5417 v = startswith(p, "var-tmp-dir=");
5418 if (v) {
5419 n = strcspn(v, " ");
5420 var_tmp_dir = strndupa(v, n);
5421 if (v[n] != ' ')
5422 goto finalize;
5423 p = v + n + 1;
5424 }
5425
5426 v = startswith(p, "netns-socket-0=");
5427 if (v) {
5428 char *buf;
5429
5430 n = strcspn(v, " ");
5431 buf = strndupa(v, n);
5432 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5433 log_debug("Unable to process exec-runtime netns fd specification.");
5434 return;
5435 }
5436 fd0 = fdset_remove(fds, fd0);
5437 if (v[n] != ' ')
5438 goto finalize;
5439 p = v + n + 1;
5440 }
5441
5442 v = startswith(p, "netns-socket-1=");
5443 if (v) {
5444 char *buf;
5445
5446 n = strcspn(v, " ");
5447 buf = strndupa(v, n);
5448 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5449 log_debug("Unable to process exec-runtime netns fd specification.");
5450 return;
5451 }
5452 fd1 = fdset_remove(fds, fd1);
5453 }
5454
5455 finalize:
5456
5457 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5458 if (r < 0)
5459 log_debug_errno(r, "Failed to add exec-runtime: %m");
5460 }
5461
5462 void exec_runtime_vacuum(Manager *m) {
5463 ExecRuntime *rt;
5464 Iterator i;
5465
5466 assert(m);
5467
5468 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5469
5470 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5471 if (rt->n_ref > 0)
5472 continue;
5473
5474 (void) exec_runtime_free(rt, false);
5475 }
5476 }
5477
5478 void exec_params_clear(ExecParameters *p) {
5479 if (!p)
5480 return;
5481
5482 strv_free(p->environment);
5483 }
5484
5485 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5486 [EXEC_INPUT_NULL] = "null",
5487 [EXEC_INPUT_TTY] = "tty",
5488 [EXEC_INPUT_TTY_FORCE] = "tty-force",
5489 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
5490 [EXEC_INPUT_SOCKET] = "socket",
5491 [EXEC_INPUT_NAMED_FD] = "fd",
5492 [EXEC_INPUT_DATA] = "data",
5493 [EXEC_INPUT_FILE] = "file",
5494 };
5495
5496 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5497
5498 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5499 [EXEC_OUTPUT_INHERIT] = "inherit",
5500 [EXEC_OUTPUT_NULL] = "null",
5501 [EXEC_OUTPUT_TTY] = "tty",
5502 [EXEC_OUTPUT_SYSLOG] = "syslog",
5503 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
5504 [EXEC_OUTPUT_KMSG] = "kmsg",
5505 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5506 [EXEC_OUTPUT_JOURNAL] = "journal",
5507 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5508 [EXEC_OUTPUT_SOCKET] = "socket",
5509 [EXEC_OUTPUT_NAMED_FD] = "fd",
5510 [EXEC_OUTPUT_FILE] = "file",
5511 [EXEC_OUTPUT_FILE_APPEND] = "append",
5512 };
5513
5514 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5515
5516 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5517 [EXEC_UTMP_INIT] = "init",
5518 [EXEC_UTMP_LOGIN] = "login",
5519 [EXEC_UTMP_USER] = "user",
5520 };
5521
5522 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5523
5524 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5525 [EXEC_PRESERVE_NO] = "no",
5526 [EXEC_PRESERVE_YES] = "yes",
5527 [EXEC_PRESERVE_RESTART] = "restart",
5528 };
5529
5530 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5531
5532 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
5533 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5534 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5535 [EXEC_DIRECTORY_STATE] = "StateDirectory",
5536 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5537 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5538 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5539 };
5540
5541 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5542
5543 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
5544 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
5545 * directories, specifically .timer units with their timestamp touch file. */
5546 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5547 [EXEC_DIRECTORY_RUNTIME] = "runtime",
5548 [EXEC_DIRECTORY_STATE] = "state",
5549 [EXEC_DIRECTORY_CACHE] = "cache",
5550 [EXEC_DIRECTORY_LOGS] = "logs",
5551 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
5552 };
5553
5554 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
5555
5556 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
5557 * the service payload in. */
5558 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5559 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
5560 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
5561 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
5562 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
5563 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
5564 };
5565
5566 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5567
5568 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5569 [EXEC_KEYRING_INHERIT] = "inherit",
5570 [EXEC_KEYRING_PRIVATE] = "private",
5571 [EXEC_KEYRING_SHARED] = "shared",
5572 };
5573
5574 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);