]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
tree-wide: port various users over to sockaddr_un_set_path()
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <glob.h>
6 #include <grp.h>
7 #include <poll.h>
8 #include <signal.h>
9 #include <string.h>
10 #include <sys/capability.h>
11 #include <sys/eventfd.h>
12 #include <sys/mman.h>
13 #include <sys/personality.h>
14 #include <sys/prctl.h>
15 #include <sys/shm.h>
16 #include <sys/socket.h>
17 #include <sys/stat.h>
18 #include <sys/types.h>
19 #include <sys/un.h>
20 #include <unistd.h>
21 #include <utmpx.h>
22
23 #if HAVE_PAM
24 #include <security/pam_appl.h>
25 #endif
26
27 #if HAVE_SELINUX
28 #include <selinux/selinux.h>
29 #endif
30
31 #if HAVE_SECCOMP
32 #include <seccomp.h>
33 #endif
34
35 #if HAVE_APPARMOR
36 #include <sys/apparmor.h>
37 #endif
38
39 #include "sd-messages.h"
40
41 #include "af-list.h"
42 #include "alloc-util.h"
43 #if HAVE_APPARMOR
44 #include "apparmor-util.h"
45 #endif
46 #include "async.h"
47 #include "barrier.h"
48 #include "cap-list.h"
49 #include "capability-util.h"
50 #include "chown-recursive.h"
51 #include "cpu-set-util.h"
52 #include "def.h"
53 #include "env-util.h"
54 #include "errno-list.h"
55 #include "execute.h"
56 #include "exit-status.h"
57 #include "fd-util.h"
58 #include "fileio.h"
59 #include "format-util.h"
60 #include "fs-util.h"
61 #include "glob-util.h"
62 #include "io-util.h"
63 #include "ioprio.h"
64 #include "label.h"
65 #include "log.h"
66 #include "macro.h"
67 #include "manager.h"
68 #include "missing.h"
69 #include "mkdir.h"
70 #include "namespace.h"
71 #include "parse-util.h"
72 #include "path-util.h"
73 #include "process-util.h"
74 #include "rlimit-util.h"
75 #include "rm-rf.h"
76 #if HAVE_SECCOMP
77 #include "seccomp-util.h"
78 #endif
79 #include "securebits.h"
80 #include "securebits-util.h"
81 #include "selinux-util.h"
82 #include "signal-util.h"
83 #include "smack-util.h"
84 #include "socket-util.h"
85 #include "special.h"
86 #include "stat-util.h"
87 #include "string-table.h"
88 #include "string-util.h"
89 #include "strv.h"
90 #include "syslog-util.h"
91 #include "terminal-util.h"
92 #include "umask-util.h"
93 #include "unit.h"
94 #include "user-util.h"
95 #include "util.h"
96 #include "utmp-wtmp.h"
97
/* NOTE(review): the two idle timeouts are not referenced in the visible part of this file; presumably
 * they bound how long a child waits on the idle pipe - confirm against their users further down. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Access mode chown_terminal() applies to a TTY and then verifies. This assumes there is a 'tty' group */
#define TTY_MODE 0620

/* Send buffer size requested (best effort) for the journal stream socket in connect_logger_as() */
#define SNDBUF_SIZE (8*1024*1024)
105
/* Rearranges the passed file descriptors so that entry i of the array ends up being fd number i+3.
 * The array is updated in place with the new fd numbers. Returns 0 on success, or a negative errno
 * if duplicating a descriptor fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        /* Each pass walks the array from 'first' on. If some slot could not be placed at its target
         * number (because that number was still occupied) we run another pass from the earliest such
         * slot, until a pass completes without any misplaced entry. */
        do {
                int retry_at = -1, idx;

                for (idx = first; idx < (int) n_fds; idx++) {
                        int target = idx + 3, dup_fd;

                        /* Nothing to do if the fd already sits at its target number */
                        if (fds[idx] == target)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, target);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* F_DUPFD picked a higher number, i.e. the target is still in use. Remember
                         * the first slot where that happened. */
                        if (dup_fd != target && retry_at < 0)
                                retry_at = idx;
                }

                first = retry_at;
        } while (first >= 0);

        return 0;
}
150
/* Adjusts the flags of the fds that are passed on to the child: the first n_socket_fds entries get
 * O_NONBLOCK set or cleared according to 'nonblock', and FD_CLOEXEC is dropped from all
 * n_socket_fds + n_storage_fds entries. Returns 0 on success, or the first negative error returned
 * by fd_nonblock()/fd_cloexec(). */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;
        size_t idx;

        if (total == 0)
                return 0;

        assert(fds);

        for (idx = 0; idx < total; idx++) {
                int r;

                /* O_NONBLOCK only applies to the socket activation fds, which come first */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is always cleared, since the fds shall survive into the child */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
183
184 static const char *exec_context_tty_path(const ExecContext *context) {
185 assert(context);
186
187 if (context->stdio_as_fds)
188 return NULL;
189
190 if (context->tty_path)
191 return context->tty_path;
192
193 return "/dev/console";
194 }
195
196 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
197 const char *path;
198
199 assert(context);
200
201 path = exec_context_tty_path(context);
202
203 if (context->tty_vhangup) {
204 if (p && p->stdin_fd >= 0)
205 (void) terminal_vhangup_fd(p->stdin_fd);
206 else if (path)
207 (void) terminal_vhangup(path);
208 }
209
210 if (context->tty_reset) {
211 if (p && p->stdin_fd >= 0)
212 (void) reset_terminal_fd(p->stdin_fd, true);
213 else if (path)
214 (void) reset_terminal(path);
215 }
216
217 if (context->tty_vt_disallocate && path)
218 (void) vt_disallocate(path);
219 }
220
221 static bool is_terminal_input(ExecInput i) {
222 return IN_SET(i,
223 EXEC_INPUT_TTY,
224 EXEC_INPUT_TTY_FORCE,
225 EXEC_INPUT_TTY_FAIL);
226 }
227
228 static bool is_terminal_output(ExecOutput o) {
229 return IN_SET(o,
230 EXEC_OUTPUT_TTY,
231 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
232 EXEC_OUTPUT_KMSG_AND_CONSOLE,
233 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
234 }
235
236 static bool is_syslog_output(ExecOutput o) {
237 return IN_SET(o,
238 EXEC_OUTPUT_SYSLOG,
239 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
240 }
241
242 static bool is_kmsg_output(ExecOutput o) {
243 return IN_SET(o,
244 EXEC_OUTPUT_KMSG,
245 EXEC_OUTPUT_KMSG_AND_CONSOLE);
246 }
247
248 static bool exec_context_needs_term(const ExecContext *c) {
249 assert(c);
250
251 /* Return true if the execution context suggests we should set $TERM to something useful. */
252
253 if (is_terminal_input(c->std_input))
254 return true;
255
256 if (is_terminal_output(c->std_output))
257 return true;
258
259 if (is_terminal_output(c->std_error))
260 return true;
261
262 return !!c->tty_path;
263 }
264
/* Opens /dev/null with the specified flags (plus O_NOCTTY) and moves the resulting fd to number nfd.
 * Returns a negative errno if opening fails, otherwise whatever move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
276
277 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
278 static const union sockaddr_union sa = {
279 .un.sun_family = AF_UNIX,
280 .un.sun_path = "/run/systemd/journal/stdout",
281 };
282 uid_t olduid = UID_INVALID;
283 gid_t oldgid = GID_INVALID;
284 int r;
285
286 if (gid_is_valid(gid)) {
287 oldgid = getgid();
288
289 if (setegid(gid) < 0)
290 return -errno;
291 }
292
293 if (uid_is_valid(uid)) {
294 olduid = getuid();
295
296 if (seteuid(uid) < 0) {
297 r = -errno;
298 goto restore_gid;
299 }
300 }
301
302 r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
303
304 /* If we fail to restore the uid or gid, things will likely
305 fail later on. This should only happen if an LSM interferes. */
306
307 if (uid_is_valid(uid))
308 (void) seteuid(olduid);
309
310 restore_gid:
311 if (gid_is_valid(gid))
312 (void) setegid(oldgid);
313
314 return r;
315 }
316
317 static int connect_logger_as(
318 const Unit *unit,
319 const ExecContext *context,
320 const ExecParameters *params,
321 ExecOutput output,
322 const char *ident,
323 int nfd,
324 uid_t uid,
325 gid_t gid) {
326
327 int fd, r;
328
329 assert(context);
330 assert(params);
331 assert(output < _EXEC_OUTPUT_MAX);
332 assert(ident);
333 assert(nfd >= 0);
334
335 fd = socket(AF_UNIX, SOCK_STREAM, 0);
336 if (fd < 0)
337 return -errno;
338
339 r = connect_journal_socket(fd, uid, gid);
340 if (r < 0)
341 return r;
342
343 if (shutdown(fd, SHUT_RD) < 0) {
344 safe_close(fd);
345 return -errno;
346 }
347
348 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
349
350 dprintf(fd,
351 "%s\n"
352 "%s\n"
353 "%i\n"
354 "%i\n"
355 "%i\n"
356 "%i\n"
357 "%i\n",
358 context->syslog_identifier ?: ident,
359 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
360 context->syslog_priority,
361 !!context->syslog_level_prefix,
362 is_syslog_output(output),
363 is_kmsg_output(output),
364 is_terminal_output(output));
365
366 return move_fd(fd, nfd, false);
367 }
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves the resulting fd to
 * number nfd. Returns the negative error from open_terminal() on failure, otherwise whatever
 * move_fd() returns. */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
380
381 static int acquire_path(const char *path, int flags, mode_t mode) {
382 union sockaddr_union sa = {};
383 _cleanup_close_ int fd = -1;
384 int r, salen;
385
386 assert(path);
387
388 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
389 flags |= O_CREAT;
390
391 fd = open(path, flags|O_NOCTTY, mode);
392 if (fd >= 0)
393 return TAKE_FD(fd);
394
395 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
396 return -errno;
397 if (strlen(path) >= sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
398 return -ENXIO;
399
400 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
401
402 fd = socket(AF_UNIX, SOCK_STREAM, 0);
403 if (fd < 0)
404 return -errno;
405
406 salen = sockaddr_un_set_path(&sa.un, path);
407 if (salen < 0)
408 return salen;
409
410 if (connect(fd, &sa.sa, salen) < 0)
411 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
412 * indication that his wasn't an AF_UNIX socket after all */
413
414 if ((flags & O_ACCMODE) == O_RDONLY)
415 r = shutdown(fd, SHUT_WR);
416 else if ((flags & O_ACCMODE) == O_WRONLY)
417 r = shutdown(fd, SHUT_RD);
418 else
419 return TAKE_FD(fd);
420 if (r < 0)
421 return -errno;
422
423 return TAKE_FD(fd);
424 }
425
426 static int fixup_input(
427 const ExecContext *context,
428 int socket_fd,
429 bool apply_tty_stdin) {
430
431 ExecInput std_input;
432
433 assert(context);
434
435 std_input = context->std_input;
436
437 if (is_terminal_input(std_input) && !apply_tty_stdin)
438 return EXEC_INPUT_NULL;
439
440 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
441 return EXEC_INPUT_NULL;
442
443 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
444 return EXEC_INPUT_NULL;
445
446 return std_input;
447 }
448
449 static int fixup_output(ExecOutput std_output, int socket_fd) {
450
451 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
452 return EXEC_OUTPUT_INHERIT;
453
454 return std_output;
455 }
456
457 static int setup_input(
458 const ExecContext *context,
459 const ExecParameters *params,
460 int socket_fd,
461 int named_iofds[3]) {
462
463 ExecInput i;
464
465 assert(context);
466 assert(params);
467
468 if (params->stdin_fd >= 0) {
469 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
470 return -errno;
471
472 /* Try to make this the controlling tty, if it is a tty, and reset it */
473 if (isatty(STDIN_FILENO)) {
474 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
475 (void) reset_terminal_fd(STDIN_FILENO, true);
476 }
477
478 return STDIN_FILENO;
479 }
480
481 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
482
483 switch (i) {
484
485 case EXEC_INPUT_NULL:
486 return open_null_as(O_RDONLY, STDIN_FILENO);
487
488 case EXEC_INPUT_TTY:
489 case EXEC_INPUT_TTY_FORCE:
490 case EXEC_INPUT_TTY_FAIL: {
491 int fd;
492
493 fd = acquire_terminal(exec_context_tty_path(context),
494 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
495 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
496 ACQUIRE_TERMINAL_WAIT,
497 USEC_INFINITY);
498 if (fd < 0)
499 return fd;
500
501 return move_fd(fd, STDIN_FILENO, false);
502 }
503
504 case EXEC_INPUT_SOCKET:
505 assert(socket_fd >= 0);
506
507 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
508
509 case EXEC_INPUT_NAMED_FD:
510 assert(named_iofds[STDIN_FILENO] >= 0);
511
512 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
513 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
514
515 case EXEC_INPUT_DATA: {
516 int fd;
517
518 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
519 if (fd < 0)
520 return fd;
521
522 return move_fd(fd, STDIN_FILENO, false);
523 }
524
525 case EXEC_INPUT_FILE: {
526 bool rw;
527 int fd;
528
529 assert(context->stdio_file[STDIN_FILENO]);
530
531 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
532 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
533
534 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
535 if (fd < 0)
536 return fd;
537
538 return move_fd(fd, STDIN_FILENO, false);
539 }
540
541 default:
542 assert_not_reached("Unknown input type");
543 }
544 }
545
546 static int setup_output(
547 const Unit *unit,
548 const ExecContext *context,
549 const ExecParameters *params,
550 int fileno,
551 int socket_fd,
552 int named_iofds[3],
553 const char *ident,
554 uid_t uid,
555 gid_t gid,
556 dev_t *journal_stream_dev,
557 ino_t *journal_stream_ino) {
558
559 ExecOutput o;
560 ExecInput i;
561 int r;
562
563 assert(unit);
564 assert(context);
565 assert(params);
566 assert(ident);
567 assert(journal_stream_dev);
568 assert(journal_stream_ino);
569
570 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
571
572 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
573 return -errno;
574
575 return STDOUT_FILENO;
576 }
577
578 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
579 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
580 return -errno;
581
582 return STDERR_FILENO;
583 }
584
585 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
586 o = fixup_output(context->std_output, socket_fd);
587
588 if (fileno == STDERR_FILENO) {
589 ExecOutput e;
590 e = fixup_output(context->std_error, socket_fd);
591
592 /* This expects the input and output are already set up */
593
594 /* Don't change the stderr file descriptor if we inherit all
595 * the way and are not on a tty */
596 if (e == EXEC_OUTPUT_INHERIT &&
597 o == EXEC_OUTPUT_INHERIT &&
598 i == EXEC_INPUT_NULL &&
599 !is_terminal_input(context->std_input) &&
600 getppid () != 1)
601 return fileno;
602
603 /* Duplicate from stdout if possible */
604 if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
605 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
606
607 o = e;
608
609 } else if (o == EXEC_OUTPUT_INHERIT) {
610 /* If input got downgraded, inherit the original value */
611 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
612 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
613
614 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
615 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
616 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
617
618 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
619 if (getppid() != 1)
620 return fileno;
621
622 /* We need to open /dev/null here anew, to get the right access mode. */
623 return open_null_as(O_WRONLY, fileno);
624 }
625
626 switch (o) {
627
628 case EXEC_OUTPUT_NULL:
629 return open_null_as(O_WRONLY, fileno);
630
631 case EXEC_OUTPUT_TTY:
632 if (is_terminal_input(i))
633 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
634
635 /* We don't reset the terminal if this is just about output */
636 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
637
638 case EXEC_OUTPUT_SYSLOG:
639 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
640 case EXEC_OUTPUT_KMSG:
641 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
642 case EXEC_OUTPUT_JOURNAL:
643 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
644 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
645 if (r < 0) {
646 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
647 r = open_null_as(O_WRONLY, fileno);
648 } else {
649 struct stat st;
650
651 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
652 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
653 * services to detect whether they are connected to the journal or not.
654 *
655 * If both stdout and stderr are connected to a stream then let's make sure to store the data
656 * about STDERR as that's usually the best way to do logging. */
657
658 if (fstat(fileno, &st) >= 0 &&
659 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
660 *journal_stream_dev = st.st_dev;
661 *journal_stream_ino = st.st_ino;
662 }
663 }
664 return r;
665
666 case EXEC_OUTPUT_SOCKET:
667 assert(socket_fd >= 0);
668
669 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
670
671 case EXEC_OUTPUT_NAMED_FD:
672 assert(named_iofds[fileno] >= 0);
673
674 (void) fd_nonblock(named_iofds[fileno], false);
675 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
676
677 case EXEC_OUTPUT_FILE:
678 case EXEC_OUTPUT_FILE_APPEND: {
679 bool rw;
680 int fd, flags;
681
682 assert(context->stdio_file[fileno]);
683
684 rw = context->std_input == EXEC_INPUT_FILE &&
685 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
686
687 if (rw)
688 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
689
690 flags = O_WRONLY;
691 if (o == EXEC_OUTPUT_FILE_APPEND)
692 flags |= O_APPEND;
693
694 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
695
696 if (fd < 0)
697 return fd;
698
699 return move_fd(fd, fileno, 0);
700 }
701
702 default:
703 assert_not_reached("Unknown error type");
704 }
705 }
706
707 static int chown_terminal(int fd, uid_t uid) {
708 struct stat st;
709
710 assert(fd >= 0);
711
712 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
713 if (isatty(fd) < 1)
714 return 0;
715
716 /* This might fail. What matters are the results. */
717 (void) fchown(fd, uid, -1);
718 (void) fchmod(fd, TTY_MODE);
719
720 if (fstat(fd, &st) < 0)
721 return -errno;
722
723 if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
724 return -EPERM;
725
726 return 0;
727 }
728
729 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
730 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
731 int r;
732
733 assert(_saved_stdin);
734 assert(_saved_stdout);
735
736 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
737 if (saved_stdin < 0)
738 return -errno;
739
740 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
741 if (saved_stdout < 0)
742 return -errno;
743
744 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
745 if (fd < 0)
746 return fd;
747
748 r = chown_terminal(fd, getuid());
749 if (r < 0)
750 return r;
751
752 r = reset_terminal_fd(fd, true);
753 if (r < 0)
754 return r;
755
756 r = rearrange_stdio(fd, fd, STDERR_FILENO);
757 fd = -1;
758 if (r < 0)
759 return r;
760
761 *_saved_stdin = saved_stdin;
762 *_saved_stdout = saved_stdout;
763
764 saved_stdin = saved_stdout = -1;
765
766 return 0;
767 }
768
769 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
770 assert(err < 0);
771
772 if (err == -ETIMEDOUT)
773 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
774 else {
775 errno = -err;
776 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
777 }
778 }
779
780 static void write_confirm_error(int err, const char *vc, const Unit *u) {
781 _cleanup_close_ int fd = -1;
782
783 assert(vc);
784
785 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
786 if (fd < 0)
787 return;
788
789 write_confirm_error_fd(err, fd, u);
790 }
791
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved fds back in place as stdin and
 * stdout (where set), and closes the saved fds, storing the result of safe_close() back into
 * *saved_stdin and *saved_stdout. Returns 0 on success, or the negative errno of the last dup2()
 * that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int result = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                result = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                result = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return result;
}
813
/* Possible outcomes of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1,   /* 'f': don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0,   /* 's': don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         =  1,   /* 'y' (and any internal error): execute the command */
};
819
820 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
821 int saved_stdout = -1, saved_stdin = -1, r;
822 _cleanup_free_ char *e = NULL;
823 char c;
824
825 /* For any internal errors, assume a positive response. */
826 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
827 if (r < 0) {
828 write_confirm_error(r, vc, u);
829 return CONFIRM_EXECUTE;
830 }
831
832 /* confirm_spawn might have been disabled while we were sleeping. */
833 if (manager_is_confirm_spawn_disabled(u->manager)) {
834 r = 1;
835 goto restore_stdio;
836 }
837
838 e = ellipsize(cmdline, 60, 100);
839 if (!e) {
840 log_oom();
841 r = CONFIRM_EXECUTE;
842 goto restore_stdio;
843 }
844
845 for (;;) {
846 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
847 if (r < 0) {
848 write_confirm_error_fd(r, STDOUT_FILENO, u);
849 r = CONFIRM_EXECUTE;
850 goto restore_stdio;
851 }
852
853 switch (c) {
854 case 'c':
855 printf("Resuming normal execution.\n");
856 manager_disable_confirm_spawn();
857 r = 1;
858 break;
859 case 'D':
860 unit_dump(u, stdout, " ");
861 continue; /* ask again */
862 case 'f':
863 printf("Failing execution.\n");
864 r = CONFIRM_PRETEND_FAILURE;
865 break;
866 case 'h':
867 printf(" c - continue, proceed without asking anymore\n"
868 " D - dump, show the state of the unit\n"
869 " f - fail, don't execute the command and pretend it failed\n"
870 " h - help\n"
871 " i - info, show a short summary of the unit\n"
872 " j - jobs, show jobs that are in progress\n"
873 " s - skip, don't execute the command and pretend it succeeded\n"
874 " y - yes, execute the command\n");
875 continue; /* ask again */
876 case 'i':
877 printf(" Description: %s\n"
878 " Unit: %s\n"
879 " Command: %s\n",
880 u->id, u->description, cmdline);
881 continue; /* ask again */
882 case 'j':
883 manager_dump_jobs(u->manager, stdout, " ");
884 continue; /* ask again */
885 case 'n':
886 /* 'n' was removed in favor of 'f'. */
887 printf("Didn't understand 'n', did you mean 'f'?\n");
888 continue; /* ask again */
889 case 's':
890 printf("Skipping execution.\n");
891 r = CONFIRM_PRETEND_SUCCESS;
892 break;
893 case 'y':
894 r = CONFIRM_EXECUTE;
895 break;
896 default:
897 assert_not_reached("Unhandled choice");
898 }
899 break;
900 }
901
902 restore_stdio:
903 restore_confirm_stdio(&saved_stdin, &saved_stdout);
904 return r;
905 }
906
907 static int get_fixed_user(const ExecContext *c, const char **user,
908 uid_t *uid, gid_t *gid,
909 const char **home, const char **shell) {
910 int r;
911 const char *name;
912
913 assert(c);
914
915 if (!c->user)
916 return 0;
917
918 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
919 * (i.e. are "/" or "/bin/nologin"). */
920
921 name = c->user;
922 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
923 if (r < 0)
924 return r;
925
926 *user = name;
927 return 0;
928 }
929
930 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
931 int r;
932 const char *name;
933
934 assert(c);
935
936 if (!c->group)
937 return 0;
938
939 name = c->group;
940 r = get_group_creds(&name, gid, 0);
941 if (r < 0)
942 return r;
943
944 *group = name;
945 return 0;
946 }
947
948 static int get_supplementary_groups(const ExecContext *c, const char *user,
949 const char *group, gid_t gid,
950 gid_t **supplementary_gids, int *ngids) {
951 char **i;
952 int r, k = 0;
953 int ngroups_max;
954 bool keep_groups = false;
955 gid_t *groups = NULL;
956 _cleanup_free_ gid_t *l_gids = NULL;
957
958 assert(c);
959
960 /*
961 * If user is given, then lookup GID and supplementary groups list.
962 * We avoid NSS lookups for gid=0. Also we have to initialize groups
963 * here and as early as possible so we keep the list of supplementary
964 * groups of the caller.
965 */
966 if (user && gid_is_valid(gid) && gid != 0) {
967 /* First step, initialize groups from /etc/groups */
968 if (initgroups(user, gid) < 0)
969 return -errno;
970
971 keep_groups = true;
972 }
973
974 if (strv_isempty(c->supplementary_groups))
975 return 0;
976
977 /*
978 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
979 * be positive, otherwise fail.
980 */
981 errno = 0;
982 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
983 if (ngroups_max <= 0) {
984 if (errno > 0)
985 return -errno;
986 else
987 return -EOPNOTSUPP; /* For all other values */
988 }
989
990 l_gids = new(gid_t, ngroups_max);
991 if (!l_gids)
992 return -ENOMEM;
993
994 if (keep_groups) {
995 /*
996 * Lookup the list of groups that the user belongs to, we
997 * avoid NSS lookups here too for gid=0.
998 */
999 k = ngroups_max;
1000 if (getgrouplist(user, gid, l_gids, &k) < 0)
1001 return -EINVAL;
1002 } else
1003 k = 0;
1004
1005 STRV_FOREACH(i, c->supplementary_groups) {
1006 const char *g;
1007
1008 if (k >= ngroups_max)
1009 return -E2BIG;
1010
1011 g = *i;
1012 r = get_group_creds(&g, l_gids+k, 0);
1013 if (r < 0)
1014 return r;
1015
1016 k++;
1017 }
1018
1019 /*
1020 * Sets ngids to zero to drop all supplementary groups, happens
1021 * when we are under root and SupplementaryGroups= is empty.
1022 */
1023 if (k == 0) {
1024 *ngids = 0;
1025 return 0;
1026 }
1027
1028 /* Otherwise get the final list of supplementary groups */
1029 groups = memdup(l_gids, sizeof(gid_t) * k);
1030 if (!groups)
1031 return -ENOMEM;
1032
1033 *supplementary_gids = groups;
1034 *ngids = k;
1035
1036 groups = NULL;
1037
1038 return 0;
1039 }
1040
/* Applies the supplementary groups (if ngids is positive) and then sets the real, effective and
 * saved GID to 'gid' (if valid). Returns 0 on success, or a negative errno. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r;

                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1059
1060 static int enforce_user(const ExecContext *context, uid_t uid) {
1061 assert(context);
1062
1063 if (!uid_is_valid(uid))
1064 return 0;
1065
1066 /* Sets (but doesn't look up) the uid and make sure we keep the
1067 * capabilities while doing so. */
1068
1069 if (context->capability_ambient_set != 0) {
1070
1071 /* First step: If we need to keep capabilities but
1072 * drop privileges we need to make sure we keep our
1073 * caps, while we drop privileges. */
1074 if (uid != 0) {
1075 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1076
1077 if (prctl(PR_GET_SECUREBITS) != sb)
1078 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1079 return -errno;
1080 }
1081 }
1082
1083 /* Second step: actually set the uids */
1084 if (setresuid(uid, uid, uid) < 0)
1085 return -errno;
1086
1087 /* At this point we should have all necessary capabilities but
1088 are otherwise a normal user. However, the caps might got
1089 corrupted due to the setresuid() so we need clean them up
1090 later. This is done outside of this call. */
1091
1092 return 0;
1093 }
1094
#if HAVE_PAM

/* PAM conversation callback that refuses every conversation: none of the arguments is looked at, and
 * PAM_CONV_ERR is returned unconditionally. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        /* Conversations are not supported */
        const int verdict = PAM_CONV_ERR;

        return verdict;
}

#endif
1109
/* Opens a PAM session for service 'name' and user 'user', and forks off a helper process ("(sd-pam)")
 * that closes the session again once the calling process goes away.
 *
 * 'tty' is passed to PAM as PAM_TTY; if NULL, the TTY name of stdin is used, if it has one. The
 * environment in *env is passed into PAM, and on success replaced by the environment list PAM returns.
 * 'fds' (n_fds entries) are closed in the helper process, 'uid' and 'gid' are what the helper drops
 * privileges to.
 *
 * Returns the result of strv_free_and_replace() on success, -EPERM on PAM failures, and a negative
 * errno on other failures. Without PAM support compiled in this is a NOP returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Pass our environment into the PAM environment */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the session has to be closed again on failure */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        /* Remember our PID, so that the child can detect whether we died already */
        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                /* The exit status remains EXIT_PAM unless the child runs through to the end */
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* Parent is still around, hence sleep until SIGTERM arrives */
                        for (;;) {
                                if (sigwait(&ss, &sig) < 0) {
                                        if (errno == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment list acquired from PAM to the caller */
        return strv_free_and_replace(*env, e);

fail:
        /* We get here either with a PAM error in pam_code, or with a negative errno in r */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1325
/* Derives a short "(name)" process title from the last component of the given path and applies it via
 * rename_process(). If the path has no usable last component, "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char title[11]; /* '(' + up to 8 characters of the name + ')' + NUL */
        const char *name;
        size_t len, skip;
        char *w = title;

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        /* The result shall fit into 10 characters (i.e. the length of "/sbin/init") to look pretty in
         * /bin/ps. If the name is too long keep its tail, since that is usually the more interesting part:
         * the beginning might just be "systemd-". */
        len = strlen(name);
        skip = len > 8 ? len - 8 : 0;
        name += skip;
        len -= skip;

        *w++ = '(';
        memcpy(w, name, len);
        w += len;
        *w++ = ')';
        *w = 0;

        rename_process(title);
}
1356
1357 static bool context_has_address_families(const ExecContext *c) {
1358 assert(c);
1359
1360 return c->address_families_whitelist ||
1361 !set_isempty(c->address_families);
1362 }
1363
1364 static bool context_has_syscall_filters(const ExecContext *c) {
1365 assert(c);
1366
1367 return c->syscall_whitelist ||
1368 !hashmap_isempty(c->syscall_filter);
1369 }
1370
1371 static bool context_has_no_new_privileges(const ExecContext *c) {
1372 assert(c);
1373
1374 if (c->no_new_privileges)
1375 return true;
1376
1377 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1378 return false;
1379
1380 /* We need NNP if we have any form of seccomp and are unprivileged */
1381 return context_has_address_families(c) ||
1382 c->memory_deny_write_execute ||
1383 c->restrict_realtime ||
1384 exec_context_restrict_namespaces_set(c) ||
1385 c->protect_kernel_tunables ||
1386 c->protect_kernel_modules ||
1387 c->private_devices ||
1388 context_has_syscall_filters(c) ||
1389 !set_isempty(c->syscall_archs) ||
1390 c->lock_personality;
1391 }
1392
1393 #if HAVE_SECCOMP
1394
1395 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1396
1397 if (is_seccomp_available())
1398 return false;
1399
1400 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1401 return true;
1402 }
1403
1404 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1405 uint32_t negative_action, default_action, action;
1406 int r;
1407
1408 assert(u);
1409 assert(c);
1410
1411 if (!context_has_syscall_filters(c))
1412 return 0;
1413
1414 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1415 return 0;
1416
1417 negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1418
1419 if (c->syscall_whitelist) {
1420 default_action = negative_action;
1421 action = SCMP_ACT_ALLOW;
1422 } else {
1423 default_action = SCMP_ACT_ALLOW;
1424 action = negative_action;
1425 }
1426
1427 if (needs_ambient_hack) {
1428 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1429 if (r < 0)
1430 return r;
1431 }
1432
1433 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1434 }
1435
1436 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1437 assert(u);
1438 assert(c);
1439
1440 if (set_isempty(c->syscall_archs))
1441 return 0;
1442
1443 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1444 return 0;
1445
1446 return seccomp_restrict_archs(c->syscall_archs);
1447 }
1448
1449 static int apply_address_families(const Unit* u, const ExecContext *c) {
1450 assert(u);
1451 assert(c);
1452
1453 if (!context_has_address_families(c))
1454 return 0;
1455
1456 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1457 return 0;
1458
1459 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1460 }
1461
1462 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1463 assert(u);
1464 assert(c);
1465
1466 if (!c->memory_deny_write_execute)
1467 return 0;
1468
1469 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1470 return 0;
1471
1472 return seccomp_memory_deny_write_execute();
1473 }
1474
1475 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1476 assert(u);
1477 assert(c);
1478
1479 if (!c->restrict_realtime)
1480 return 0;
1481
1482 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1483 return 0;
1484
1485 return seccomp_restrict_realtime();
1486 }
1487
1488 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1489 assert(u);
1490 assert(c);
1491
1492 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1493 * let's protect even those systems where this is left on in the kernel. */
1494
1495 if (!c->protect_kernel_tunables)
1496 return 0;
1497
1498 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1499 return 0;
1500
1501 return seccomp_protect_sysctl();
1502 }
1503
1504 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1505 assert(u);
1506 assert(c);
1507
1508 /* Turn off module syscalls on ProtectKernelModules=yes */
1509
1510 if (!c->protect_kernel_modules)
1511 return 0;
1512
1513 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1514 return 0;
1515
1516 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1517 }
1518
1519 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1520 assert(u);
1521 assert(c);
1522
1523 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1524
1525 if (!c->private_devices)
1526 return 0;
1527
1528 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1529 return 0;
1530
1531 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1532 }
1533
1534 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1535 assert(u);
1536 assert(c);
1537
1538 if (!exec_context_restrict_namespaces_set(c))
1539 return 0;
1540
1541 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1542 return 0;
1543
1544 return seccomp_restrict_namespaces(c->restrict_namespaces);
1545 }
1546
1547 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1548 unsigned long personality;
1549 int r;
1550
1551 assert(u);
1552 assert(c);
1553
1554 if (!c->lock_personality)
1555 return 0;
1556
1557 if (skip_seccomp_unavailable(u, "LockPersonality="))
1558 return 0;
1559
1560 personality = c->personality;
1561
1562 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1563 if (personality == PERSONALITY_INVALID) {
1564
1565 r = opinionated_personality(&personality);
1566 if (r < 0)
1567 return r;
1568 }
1569
1570 return seccomp_lock_personality(personality);
1571 }
1572
1573 #endif
1574
1575 static void do_idle_pipe_dance(int idle_pipe[4]) {
1576 assert(idle_pipe);
1577
1578 idle_pipe[1] = safe_close(idle_pipe[1]);
1579 idle_pipe[2] = safe_close(idle_pipe[2]);
1580
1581 if (idle_pipe[0] >= 0) {
1582 int r;
1583
1584 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1585
1586 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1587 ssize_t n;
1588
1589 /* Signal systemd that we are bored and want to continue. */
1590 n = write(idle_pipe[3], "x", 1);
1591 if (n > 0)
1592 /* Wait for systemd to react to the signal above. */
1593 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1594 }
1595
1596 idle_pipe[0] = safe_close(idle_pipe[0]);
1597
1598 }
1599
1600 idle_pipe[3] = safe_close(idle_pipe[3]);
1601 }
1602
1603 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1604
1605 static int build_environment(
1606 const Unit *u,
1607 const ExecContext *c,
1608 const ExecParameters *p,
1609 size_t n_fds,
1610 const char *home,
1611 const char *username,
1612 const char *shell,
1613 dev_t journal_stream_dev,
1614 ino_t journal_stream_ino,
1615 char ***ret) {
1616
1617 _cleanup_strv_free_ char **our_env = NULL;
1618 ExecDirectoryType t;
1619 size_t n_env = 0;
1620 char *x;
1621
1622 assert(u);
1623 assert(c);
1624 assert(p);
1625 assert(ret);
1626
1627 our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX);
1628 if (!our_env)
1629 return -ENOMEM;
1630
1631 if (n_fds > 0) {
1632 _cleanup_free_ char *joined = NULL;
1633
1634 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1635 return -ENOMEM;
1636 our_env[n_env++] = x;
1637
1638 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1639 return -ENOMEM;
1640 our_env[n_env++] = x;
1641
1642 joined = strv_join(p->fd_names, ":");
1643 if (!joined)
1644 return -ENOMEM;
1645
1646 x = strjoin("LISTEN_FDNAMES=", joined);
1647 if (!x)
1648 return -ENOMEM;
1649 our_env[n_env++] = x;
1650 }
1651
1652 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1653 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1654 return -ENOMEM;
1655 our_env[n_env++] = x;
1656
1657 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1658 return -ENOMEM;
1659 our_env[n_env++] = x;
1660 }
1661
1662 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1663 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1664 * check the database directly. */
1665 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1666 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1667 if (!x)
1668 return -ENOMEM;
1669 our_env[n_env++] = x;
1670 }
1671
1672 if (home) {
1673 x = strappend("HOME=", home);
1674 if (!x)
1675 return -ENOMEM;
1676 our_env[n_env++] = x;
1677 }
1678
1679 if (username) {
1680 x = strappend("LOGNAME=", username);
1681 if (!x)
1682 return -ENOMEM;
1683 our_env[n_env++] = x;
1684
1685 x = strappend("USER=", username);
1686 if (!x)
1687 return -ENOMEM;
1688 our_env[n_env++] = x;
1689 }
1690
1691 if (shell) {
1692 x = strappend("SHELL=", shell);
1693 if (!x)
1694 return -ENOMEM;
1695 our_env[n_env++] = x;
1696 }
1697
1698 if (!sd_id128_is_null(u->invocation_id)) {
1699 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1700 return -ENOMEM;
1701
1702 our_env[n_env++] = x;
1703 }
1704
1705 if (exec_context_needs_term(c)) {
1706 const char *tty_path, *term = NULL;
1707
1708 tty_path = exec_context_tty_path(c);
1709
1710 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1711 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1712 * passes to PID 1 ends up all the way in the console login shown. */
1713
1714 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1715 term = getenv("TERM");
1716 if (!term)
1717 term = default_term_for_tty(tty_path);
1718
1719 x = strappend("TERM=", term);
1720 if (!x)
1721 return -ENOMEM;
1722 our_env[n_env++] = x;
1723 }
1724
1725 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1726 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1727 return -ENOMEM;
1728
1729 our_env[n_env++] = x;
1730 }
1731
1732 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1733 _cleanup_free_ char *pre = NULL, *joined = NULL;
1734 const char *n;
1735
1736 if (!p->prefix[t])
1737 continue;
1738
1739 if (strv_isempty(c->directories[t].paths))
1740 continue;
1741
1742 n = exec_directory_env_name_to_string(t);
1743 if (!n)
1744 continue;
1745
1746 pre = strjoin(p->prefix[t], "/");
1747 if (!pre)
1748 return -ENOMEM;
1749
1750 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1751 if (!joined)
1752 return -ENOMEM;
1753
1754 x = strjoin(n, "=", joined);
1755 if (!x)
1756 return -ENOMEM;
1757
1758 our_env[n_env++] = x;
1759 }
1760
1761 our_env[n_env++] = NULL;
1762 assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1763
1764 *ret = TAKE_PTR(our_env);
1765
1766 return 0;
1767 }
1768
1769 static int build_pass_environment(const ExecContext *c, char ***ret) {
1770 _cleanup_strv_free_ char **pass_env = NULL;
1771 size_t n_env = 0, n_bufsize = 0;
1772 char **i;
1773
1774 STRV_FOREACH(i, c->pass_environment) {
1775 _cleanup_free_ char *x = NULL;
1776 char *v;
1777
1778 v = getenv(*i);
1779 if (!v)
1780 continue;
1781 x = strjoin(*i, "=", v);
1782 if (!x)
1783 return -ENOMEM;
1784
1785 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1786 return -ENOMEM;
1787
1788 pass_env[n_env++] = TAKE_PTR(x);
1789 pass_env[n_env] = NULL;
1790 }
1791
1792 *ret = TAKE_PTR(pass_env);
1793
1794 return 0;
1795 }
1796
1797 static bool exec_needs_mount_namespace(
1798 const ExecContext *context,
1799 const ExecParameters *params,
1800 const ExecRuntime *runtime) {
1801
1802 assert(context);
1803 assert(params);
1804
1805 if (context->root_image)
1806 return true;
1807
1808 if (!strv_isempty(context->read_write_paths) ||
1809 !strv_isempty(context->read_only_paths) ||
1810 !strv_isempty(context->inaccessible_paths))
1811 return true;
1812
1813 if (context->n_bind_mounts > 0)
1814 return true;
1815
1816 if (context->n_temporary_filesystems > 0)
1817 return true;
1818
1819 if (context->mount_flags != 0)
1820 return true;
1821
1822 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1823 return true;
1824
1825 if (context->private_devices ||
1826 context->private_mounts ||
1827 context->protect_system != PROTECT_SYSTEM_NO ||
1828 context->protect_home != PROTECT_HOME_NO ||
1829 context->protect_kernel_tunables ||
1830 context->protect_kernel_modules ||
1831 context->protect_control_groups)
1832 return true;
1833
1834 if (context->root_directory) {
1835 ExecDirectoryType t;
1836
1837 if (context->mount_apivfs)
1838 return true;
1839
1840 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1841 if (!params->prefix[t])
1842 continue;
1843
1844 if (!strv_isempty(context->directories[t].paths))
1845 return true;
1846 }
1847 }
1848
1849 if (context->dynamic_user &&
1850 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1851 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1852 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1853 return true;
1854
1855 return false;
1856 }
1857
1858 static int setup_private_users(uid_t uid, gid_t gid) {
1859 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1860 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1861 _cleanup_close_ int unshare_ready_fd = -1;
1862 _cleanup_(sigkill_waitp) pid_t pid = 0;
1863 uint64_t c = 1;
1864 ssize_t n;
1865 int r;
1866
1867 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1868 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1869 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1870 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1871 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1872 * continues execution normally. */
1873
1874 if (uid != 0 && uid_is_valid(uid)) {
1875 r = asprintf(&uid_map,
1876 "0 0 1\n" /* Map root → root */
1877 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
1878 uid, uid);
1879 if (r < 0)
1880 return -ENOMEM;
1881 } else {
1882 uid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1883 if (!uid_map)
1884 return -ENOMEM;
1885 }
1886
1887 if (gid != 0 && gid_is_valid(gid)) {
1888 r = asprintf(&gid_map,
1889 "0 0 1\n" /* Map root → root */
1890 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
1891 gid, gid);
1892 if (r < 0)
1893 return -ENOMEM;
1894 } else {
1895 gid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1896 if (!gid_map)
1897 return -ENOMEM;
1898 }
1899
1900 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1901 * namespace. */
1902 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1903 if (unshare_ready_fd < 0)
1904 return -errno;
1905
1906 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1907 * failed. */
1908 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1909 return -errno;
1910
1911 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1912 if (r < 0)
1913 return r;
1914 if (r == 0) {
1915 _cleanup_close_ int fd = -1;
1916 const char *a;
1917 pid_t ppid;
1918
1919 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1920 * here, after the parent opened its own user namespace. */
1921
1922 ppid = getppid();
1923 errno_pipe[0] = safe_close(errno_pipe[0]);
1924
1925 /* Wait until the parent unshared the user namespace */
1926 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1927 r = -errno;
1928 goto child_fail;
1929 }
1930
1931 /* Disable the setgroups() system call in the child user namespace, for good. */
1932 a = procfs_file_alloca(ppid, "setgroups");
1933 fd = open(a, O_WRONLY|O_CLOEXEC);
1934 if (fd < 0) {
1935 if (errno != ENOENT) {
1936 r = -errno;
1937 goto child_fail;
1938 }
1939
1940 /* If the file is missing the kernel is too old, let's continue anyway. */
1941 } else {
1942 if (write(fd, "deny\n", 5) < 0) {
1943 r = -errno;
1944 goto child_fail;
1945 }
1946
1947 fd = safe_close(fd);
1948 }
1949
1950 /* First write the GID map */
1951 a = procfs_file_alloca(ppid, "gid_map");
1952 fd = open(a, O_WRONLY|O_CLOEXEC);
1953 if (fd < 0) {
1954 r = -errno;
1955 goto child_fail;
1956 }
1957 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1958 r = -errno;
1959 goto child_fail;
1960 }
1961 fd = safe_close(fd);
1962
1963 /* The write the UID map */
1964 a = procfs_file_alloca(ppid, "uid_map");
1965 fd = open(a, O_WRONLY|O_CLOEXEC);
1966 if (fd < 0) {
1967 r = -errno;
1968 goto child_fail;
1969 }
1970 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1971 r = -errno;
1972 goto child_fail;
1973 }
1974
1975 _exit(EXIT_SUCCESS);
1976
1977 child_fail:
1978 (void) write(errno_pipe[1], &r, sizeof(r));
1979 _exit(EXIT_FAILURE);
1980 }
1981
1982 errno_pipe[1] = safe_close(errno_pipe[1]);
1983
1984 if (unshare(CLONE_NEWUSER) < 0)
1985 return -errno;
1986
1987 /* Let the child know that the namespace is ready now */
1988 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1989 return -errno;
1990
1991 /* Try to read an error code from the child */
1992 n = read(errno_pipe[0], &r, sizeof(r));
1993 if (n < 0)
1994 return -errno;
1995 if (n == sizeof(r)) { /* an error code was sent to us */
1996 if (r < 0)
1997 return r;
1998 return -EIO;
1999 }
2000 if (n != 0) /* on success we should have read 0 bytes */
2001 return -EIO;
2002
2003 r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2004 pid = 0;
2005 if (r < 0)
2006 return r;
2007 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2008 return -EIO;
2009
2010 return 0;
2011 }
2012
2013 static int setup_exec_directory(
2014 const ExecContext *context,
2015 const ExecParameters *params,
2016 uid_t uid,
2017 gid_t gid,
2018 ExecDirectoryType type,
2019 int *exit_status) {
2020
2021 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2022 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2023 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2024 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2025 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2026 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2027 };
2028 char **rt;
2029 int r;
2030
2031 assert(context);
2032 assert(params);
2033 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2034 assert(exit_status);
2035
2036 if (!params->prefix[type])
2037 return 0;
2038
2039 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2040 if (!uid_is_valid(uid))
2041 uid = 0;
2042 if (!gid_is_valid(gid))
2043 gid = 0;
2044 }
2045
2046 STRV_FOREACH(rt, context->directories[type].paths) {
2047 _cleanup_free_ char *p = NULL, *pp = NULL;
2048
2049 p = strjoin(params->prefix[type], "/", *rt);
2050 if (!p) {
2051 r = -ENOMEM;
2052 goto fail;
2053 }
2054
2055 r = mkdir_parents_label(p, 0755);
2056 if (r < 0)
2057 goto fail;
2058
2059 if (context->dynamic_user &&
2060 !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2061 _cleanup_free_ char *private_root = NULL;
2062
2063 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2064 * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2065 * whose UID is later on reused. To lock this down we use the same trick used by container
2066 * managers to prohibit host users to get access to files of the same UID in containers: we
2067 * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2068 * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2069 * to make this directory permeable for the service itself.
2070 *
2071 * Specifically: for a service which wants a special directory "foo/" we first create a
2072 * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2073 * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2074 * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2075 * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
2076 * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2077 * disabling the access boundary for the service and making sure it only gets access to the
2078 * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2079 *
2080 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2081 * owned by the service itself.
2082 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2083 * files or sockets with other services. */
2084
2085 private_root = strjoin(params->prefix[type], "/private");
2086 if (!private_root) {
2087 r = -ENOMEM;
2088 goto fail;
2089 }
2090
2091 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2092 r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2093 if (r < 0)
2094 goto fail;
2095
2096 pp = strjoin(private_root, "/", *rt);
2097 if (!pp) {
2098 r = -ENOMEM;
2099 goto fail;
2100 }
2101
2102 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2103 r = mkdir_parents_label(pp, 0755);
2104 if (r < 0)
2105 goto fail;
2106
2107 if (is_dir(p, false) > 0 &&
2108 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2109
2110 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2111 * it over. Most likely the service has been upgraded from one that didn't use
2112 * DynamicUser=1, to one that does. */
2113
2114 if (rename(p, pp) < 0) {
2115 r = -errno;
2116 goto fail;
2117 }
2118 } else {
2119 /* Otherwise, create the actual directory for the service */
2120
2121 r = mkdir_label(pp, context->directories[type].mode);
2122 if (r < 0 && r != -EEXIST)
2123 goto fail;
2124 }
2125
2126 /* And link it up from the original place */
2127 r = symlink_idempotent(pp, p, true);
2128 if (r < 0)
2129 goto fail;
2130
2131 /* Lock down the access mode */
2132 if (chmod(pp, context->directories[type].mode) < 0) {
2133 r = -errno;
2134 goto fail;
2135 }
2136 } else {
2137 r = mkdir_label(p, context->directories[type].mode);
2138 if (r < 0 && r != -EEXIST)
2139 goto fail;
2140 if (r == -EEXIST && !context->dynamic_user)
2141 continue;
2142 }
2143
2144 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2145 * a service, and shall not be writable. */
2146 if (type == EXEC_DIRECTORY_CONFIGURATION)
2147 continue;
2148
2149 /* Then, change the ownership of the whole tree, if necessary */
2150 r = path_chown_recursive(pp ?: p, uid, gid);
2151 if (r < 0)
2152 goto fail;
2153 }
2154
2155 return 0;
2156
2157 fail:
2158 *exit_status = exit_status_table[type];
2159 return r;
2160 }
2161
2162 #if ENABLE_SMACK
2163 static int setup_smack(
2164 const ExecContext *context,
2165 const ExecCommand *command) {
2166
2167 int r;
2168
2169 assert(context);
2170 assert(command);
2171
2172 if (context->smack_process_label) {
2173 r = mac_smack_apply_pid(0, context->smack_process_label);
2174 if (r < 0)
2175 return r;
2176 }
2177 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2178 else {
2179 _cleanup_free_ char *exec_label = NULL;
2180
2181 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2182 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2183 return r;
2184
2185 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2186 if (r < 0)
2187 return r;
2188 }
2189 #endif
2190
2191 return 0;
2192 }
2193 #endif
2194
2195 static int compile_bind_mounts(
2196 const ExecContext *context,
2197 const ExecParameters *params,
2198 BindMount **ret_bind_mounts,
2199 size_t *ret_n_bind_mounts,
2200 char ***ret_empty_directories) {
2201
2202 _cleanup_strv_free_ char **empty_directories = NULL;
2203 BindMount *bind_mounts;
2204 size_t n, h = 0, i;
2205 ExecDirectoryType t;
2206 int r;
2207
2208 assert(context);
2209 assert(params);
2210 assert(ret_bind_mounts);
2211 assert(ret_n_bind_mounts);
2212 assert(ret_empty_directories);
2213
2214 n = context->n_bind_mounts;
2215 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2216 if (!params->prefix[t])
2217 continue;
2218
2219 n += strv_length(context->directories[t].paths);
2220 }
2221
2222 if (n <= 0) {
2223 *ret_bind_mounts = NULL;
2224 *ret_n_bind_mounts = 0;
2225 *ret_empty_directories = NULL;
2226 return 0;
2227 }
2228
2229 bind_mounts = new(BindMount, n);
2230 if (!bind_mounts)
2231 return -ENOMEM;
2232
2233 for (i = 0; i < context->n_bind_mounts; i++) {
2234 BindMount *item = context->bind_mounts + i;
2235 char *s, *d;
2236
2237 s = strdup(item->source);
2238 if (!s) {
2239 r = -ENOMEM;
2240 goto finish;
2241 }
2242
2243 d = strdup(item->destination);
2244 if (!d) {
2245 free(s);
2246 r = -ENOMEM;
2247 goto finish;
2248 }
2249
2250 bind_mounts[h++] = (BindMount) {
2251 .source = s,
2252 .destination = d,
2253 .read_only = item->read_only,
2254 .recursive = item->recursive,
2255 .ignore_enoent = item->ignore_enoent,
2256 };
2257 }
2258
2259 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2260 char **suffix;
2261
2262 if (!params->prefix[t])
2263 continue;
2264
2265 if (strv_isempty(context->directories[t].paths))
2266 continue;
2267
2268 if (context->dynamic_user &&
2269 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2270 !(context->root_directory || context->root_image)) {
2271 char *private_root;
2272
2273 /* So this is for a dynamic user, and we need to make sure the process can access its own
2274 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2275 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2276
2277 private_root = strjoin(params->prefix[t], "/private");
2278 if (!private_root) {
2279 r = -ENOMEM;
2280 goto finish;
2281 }
2282
2283 r = strv_consume(&empty_directories, private_root);
2284 if (r < 0)
2285 goto finish;
2286 }
2287
2288 STRV_FOREACH(suffix, context->directories[t].paths) {
2289 char *s, *d;
2290
2291 if (context->dynamic_user &&
2292 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2293 s = strjoin(params->prefix[t], "/private/", *suffix);
2294 else
2295 s = strjoin(params->prefix[t], "/", *suffix);
2296 if (!s) {
2297 r = -ENOMEM;
2298 goto finish;
2299 }
2300
2301 if (context->dynamic_user &&
2302 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2303 (context->root_directory || context->root_image))
2304 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2305 * directory is not created on the root directory. So, let's bind-mount the directory
2306 * on the 'non-private' place. */
2307 d = strjoin(params->prefix[t], "/", *suffix);
2308 else
2309 d = strdup(s);
2310 if (!d) {
2311 free(s);
2312 r = -ENOMEM;
2313 goto finish;
2314 }
2315
2316 bind_mounts[h++] = (BindMount) {
2317 .source = s,
2318 .destination = d,
2319 .read_only = false,
2320 .recursive = true,
2321 .ignore_enoent = false,
2322 };
2323 }
2324 }
2325
2326 assert(h == n);
2327
2328 *ret_bind_mounts = bind_mounts;
2329 *ret_n_bind_mounts = n;
2330 *ret_empty_directories = TAKE_PTR(empty_directories);
2331
2332 return (int) n;
2333
2334 finish:
2335 bind_mount_free_many(bind_mounts, h);
2336 return r;
2337 }
2338
2339 static int apply_mount_namespace(
2340 const Unit *u,
2341 const ExecCommand *command,
2342 const ExecContext *context,
2343 const ExecParameters *params,
2344 const ExecRuntime *runtime) {
2345
2346 _cleanup_strv_free_ char **empty_directories = NULL;
2347 char *tmp = NULL, *var = NULL;
2348 const char *root_dir = NULL, *root_image = NULL;
2349 NamespaceInfo ns_info;
2350 bool needs_sandboxing;
2351 BindMount *bind_mounts = NULL;
2352 size_t n_bind_mounts = 0;
2353 int r;
2354
2355 assert(context);
2356
2357 /* The runtime struct only contains the parent of the private /tmp,
2358 * which is non-accessible to world users. Inside of it there's a /tmp
2359 * that is sticky, and that's the one we want to use here. */
2360
2361 if (context->private_tmp && runtime) {
2362 if (runtime->tmp_dir)
2363 tmp = strjoina(runtime->tmp_dir, "/tmp");
2364 if (runtime->var_tmp_dir)
2365 var = strjoina(runtime->var_tmp_dir, "/tmp");
2366 }
2367
2368 if (params->flags & EXEC_APPLY_CHROOT) {
2369 root_image = context->root_image;
2370
2371 if (!root_image)
2372 root_dir = context->root_directory;
2373 }
2374
2375 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2376 if (r < 0)
2377 return r;
2378
2379 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2380 if (needs_sandboxing)
2381 ns_info = (NamespaceInfo) {
2382 .ignore_protect_paths = false,
2383 .private_dev = context->private_devices,
2384 .protect_control_groups = context->protect_control_groups,
2385 .protect_kernel_tunables = context->protect_kernel_tunables,
2386 .protect_kernel_modules = context->protect_kernel_modules,
2387 .mount_apivfs = context->mount_apivfs,
2388 .private_mounts = context->private_mounts,
2389 };
2390 else if (!context->dynamic_user && root_dir)
2391 /*
2392 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2393 * sandbox info, otherwise enforce it, don't ignore protected paths and
2394 * fail if we are enable to apply the sandbox inside the mount namespace.
2395 */
2396 ns_info = (NamespaceInfo) {
2397 .ignore_protect_paths = true,
2398 };
2399 else
2400 ns_info = (NamespaceInfo) {};
2401
2402 r = setup_namespace(root_dir, root_image,
2403 &ns_info, context->read_write_paths,
2404 needs_sandboxing ? context->read_only_paths : NULL,
2405 needs_sandboxing ? context->inaccessible_paths : NULL,
2406 empty_directories,
2407 bind_mounts,
2408 n_bind_mounts,
2409 context->temporary_filesystems,
2410 context->n_temporary_filesystems,
2411 tmp,
2412 var,
2413 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2414 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2415 context->mount_flags,
2416 DISSECT_IMAGE_DISCARD_ON_LOOP);
2417
2418 bind_mount_free_many(bind_mounts, n_bind_mounts);
2419
2420 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2421 * that with a special, recognizable error ENOANO. In this case, silently proceeed, but only if exclusively
2422 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2423 * completely different execution environment. */
2424 if (r == -ENOANO) {
2425 if (n_bind_mounts == 0 &&
2426 context->n_temporary_filesystems == 0 &&
2427 !root_dir && !root_image &&
2428 !context->dynamic_user) {
2429 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2430 return 0;
2431 }
2432
2433 return -EOPNOTSUPP;
2434 }
2435
2436 return r;
2437 }
2438
2439 static int apply_working_directory(
2440 const ExecContext *context,
2441 const ExecParameters *params,
2442 const char *home,
2443 const bool needs_mount_ns,
2444 int *exit_status) {
2445
2446 const char *d, *wd;
2447
2448 assert(context);
2449 assert(exit_status);
2450
2451 if (context->working_directory_home) {
2452
2453 if (!home) {
2454 *exit_status = EXIT_CHDIR;
2455 return -ENXIO;
2456 }
2457
2458 wd = home;
2459
2460 } else if (context->working_directory)
2461 wd = context->working_directory;
2462 else
2463 wd = "/";
2464
2465 if (params->flags & EXEC_APPLY_CHROOT) {
2466 if (!needs_mount_ns && context->root_directory)
2467 if (chroot(context->root_directory) < 0) {
2468 *exit_status = EXIT_CHROOT;
2469 return -errno;
2470 }
2471
2472 d = wd;
2473 } else
2474 d = prefix_roota(context->root_directory, wd);
2475
2476 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2477 *exit_status = EXIT_CHDIR;
2478 return -errno;
2479 }
2480
2481 return 0;
2482 }
2483
2484 static int setup_keyring(
2485 const Unit *u,
2486 const ExecContext *context,
2487 const ExecParameters *p,
2488 uid_t uid, gid_t gid) {
2489
2490 key_serial_t keyring;
2491 int r = 0;
2492 uid_t saved_uid;
2493 gid_t saved_gid;
2494
2495 assert(u);
2496 assert(context);
2497 assert(p);
2498
2499 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2500 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2501 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2502 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2503 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2504 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2505
2506 if (!(p->flags & EXEC_NEW_KEYRING))
2507 return 0;
2508
2509 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2510 return 0;
2511
2512 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2513 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2514 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2515 * & group is just as nasty as acquiring a reference to the user keyring. */
2516
2517 saved_uid = getuid();
2518 saved_gid = getgid();
2519
2520 if (gid_is_valid(gid) && gid != saved_gid) {
2521 if (setregid(gid, -1) < 0)
2522 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2523 }
2524
2525 if (uid_is_valid(uid) && uid != saved_uid) {
2526 if (setreuid(uid, -1) < 0) {
2527 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2528 goto out;
2529 }
2530 }
2531
2532 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2533 if (keyring == -1) {
2534 if (errno == ENOSYS)
2535 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2536 else if (IN_SET(errno, EACCES, EPERM))
2537 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2538 else if (errno == EDQUOT)
2539 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2540 else
2541 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2542
2543 goto out;
2544 }
2545
2546 /* When requested link the user keyring into the session keyring. */
2547 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2548
2549 if (keyctl(KEYCTL_LINK,
2550 KEY_SPEC_USER_KEYRING,
2551 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2552 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2553 goto out;
2554 }
2555 }
2556
2557 /* Restore uid/gid back */
2558 if (uid_is_valid(uid) && uid != saved_uid) {
2559 if (setreuid(saved_uid, -1) < 0) {
2560 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2561 goto out;
2562 }
2563 }
2564
2565 if (gid_is_valid(gid) && gid != saved_gid) {
2566 if (setregid(saved_gid, -1) < 0)
2567 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2568 }
2569
2570 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2571 if (!sd_id128_is_null(u->invocation_id)) {
2572 key_serial_t key;
2573
2574 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2575 if (key == -1)
2576 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2577 else {
2578 if (keyctl(KEYCTL_SETPERM, key,
2579 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2580 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2581 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2582 }
2583 }
2584
2585 out:
2586 /* Revert back uid & gid for the the last time, and exit */
2587 /* no extra logging, as only the first already reported error matters */
2588 if (getuid() != saved_uid)
2589 (void) setreuid(saved_uid, -1);
2590
2591 if (getgid() != saved_gid)
2592 (void) setregid(saved_gid, -1);
2593
2594 return r;
2595 }
2596
/* Appends the valid (i.e. non-negative) file descriptors of a socket pair to 'array', advancing the element
 * counter '*n' for each one added. A NULL 'pair' is accepted and leaves the array untouched. The caller must
 * make sure the array has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[2]) {
        size_t k;

        assert(array);
        assert(n);

        if (!pair)
                return;

        for (k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2609
2610 static int close_remaining_fds(
2611 const ExecParameters *params,
2612 const ExecRuntime *runtime,
2613 const DynamicCreds *dcreds,
2614 int user_lookup_fd,
2615 int socket_fd,
2616 int exec_fd,
2617 int *fds, size_t n_fds) {
2618
2619 size_t n_dont_close = 0;
2620 int dont_close[n_fds + 12];
2621
2622 assert(params);
2623
2624 if (params->stdin_fd >= 0)
2625 dont_close[n_dont_close++] = params->stdin_fd;
2626 if (params->stdout_fd >= 0)
2627 dont_close[n_dont_close++] = params->stdout_fd;
2628 if (params->stderr_fd >= 0)
2629 dont_close[n_dont_close++] = params->stderr_fd;
2630
2631 if (socket_fd >= 0)
2632 dont_close[n_dont_close++] = socket_fd;
2633 if (exec_fd >= 0)
2634 dont_close[n_dont_close++] = exec_fd;
2635 if (n_fds > 0) {
2636 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2637 n_dont_close += n_fds;
2638 }
2639
2640 if (runtime)
2641 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2642
2643 if (dcreds) {
2644 if (dcreds->user)
2645 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2646 if (dcreds->group)
2647 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2648 }
2649
2650 if (user_lookup_fd >= 0)
2651 dont_close[n_dont_close++] = user_lookup_fd;
2652
2653 return close_all_fds(dont_close, n_dont_close);
2654 }
2655
2656 static int send_user_lookup(
2657 Unit *unit,
2658 int user_lookup_fd,
2659 uid_t uid,
2660 gid_t gid) {
2661
2662 assert(unit);
2663
2664 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2665 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2666 * specified. */
2667
2668 if (user_lookup_fd < 0)
2669 return 0;
2670
2671 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2672 return 0;
2673
2674 if (writev(user_lookup_fd,
2675 (struct iovec[]) {
2676 IOVEC_INIT(&uid, sizeof(uid)),
2677 IOVEC_INIT(&gid, sizeof(gid)),
2678 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2679 return -errno;
2680
2681 return 0;
2682 }
2683
2684 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2685 int r;
2686
2687 assert(c);
2688 assert(home);
2689 assert(buf);
2690
2691 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2692
2693 if (*home)
2694 return 0;
2695
2696 if (!c->working_directory_home)
2697 return 0;
2698
2699 if (uid == 0) {
2700 /* Hardcode /root as home directory for UID 0 */
2701 *home = "/root";
2702 return 1;
2703 }
2704
2705 r = get_home_dir(buf);
2706 if (r < 0)
2707 return r;
2708
2709 *home = *buf;
2710 return 1;
2711 }
2712
2713 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2714 _cleanup_strv_free_ char ** list = NULL;
2715 ExecDirectoryType t;
2716 int r;
2717
2718 assert(c);
2719 assert(p);
2720 assert(ret);
2721
2722 assert(c->dynamic_user);
2723
2724 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2725 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2726 * directories. */
2727
2728 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2729 char **i;
2730
2731 if (t == EXEC_DIRECTORY_CONFIGURATION)
2732 continue;
2733
2734 if (!p->prefix[t])
2735 continue;
2736
2737 STRV_FOREACH(i, c->directories[t].paths) {
2738 char *e;
2739
2740 if (t == EXEC_DIRECTORY_RUNTIME)
2741 e = strjoin(p->prefix[t], "/", *i);
2742 else
2743 e = strjoin(p->prefix[t], "/private/", *i);
2744 if (!e)
2745 return -ENOMEM;
2746
2747 r = strv_consume(&list, e);
2748 if (r < 0)
2749 return r;
2750 }
2751 }
2752
2753 *ret = TAKE_PTR(list);
2754
2755 return 0;
2756 }
2757
2758 static char *exec_command_line(char **argv);
2759
2760 static int exec_child(
2761 Unit *unit,
2762 const ExecCommand *command,
2763 const ExecContext *context,
2764 const ExecParameters *params,
2765 ExecRuntime *runtime,
2766 DynamicCreds *dcreds,
2767 int socket_fd,
2768 int named_iofds[3],
2769 int *fds,
2770 size_t n_socket_fds,
2771 size_t n_storage_fds,
2772 char **files_env,
2773 int user_lookup_fd,
2774 int *exit_status) {
2775
2776 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2777 int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
2778 _cleanup_free_ gid_t *supplementary_gids = NULL;
2779 const char *username = NULL, *groupname = NULL;
2780 _cleanup_free_ char *home_buffer = NULL;
2781 const char *home = NULL, *shell = NULL;
2782 dev_t journal_stream_dev = 0;
2783 ino_t journal_stream_ino = 0;
2784 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2785 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
2786 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
2787 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
2788 #if HAVE_SELINUX
2789 _cleanup_free_ char *mac_selinux_context_net = NULL;
2790 bool use_selinux = false;
2791 #endif
2792 #if ENABLE_SMACK
2793 bool use_smack = false;
2794 #endif
2795 #if HAVE_APPARMOR
2796 bool use_apparmor = false;
2797 #endif
2798 uid_t uid = UID_INVALID;
2799 gid_t gid = GID_INVALID;
2800 size_t n_fds;
2801 ExecDirectoryType dt;
2802 int secure_bits;
2803
2804 assert(unit);
2805 assert(command);
2806 assert(context);
2807 assert(params);
2808 assert(exit_status);
2809
2810 rename_process_from_path(command->path);
2811
2812 /* We reset exactly these signals, since they are the
2813 * only ones we set to SIG_IGN in the main daemon. All
2814 * others we leave untouched because we set them to
2815 * SIG_DFL or a valid handler initially, both of which
2816 * will be demoted to SIG_DFL. */
2817 (void) default_signals(SIGNALS_CRASH_HANDLER,
2818 SIGNALS_IGNORE, -1);
2819
2820 if (context->ignore_sigpipe)
2821 (void) ignore_signals(SIGPIPE, -1);
2822
2823 r = reset_signal_mask();
2824 if (r < 0) {
2825 *exit_status = EXIT_SIGNAL_MASK;
2826 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2827 }
2828
2829 if (params->idle_pipe)
2830 do_idle_pipe_dance(params->idle_pipe);
2831
2832 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2833 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2834 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2835 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2836
2837 log_forget_fds();
2838 log_set_open_when_needed(true);
2839
2840 /* In case anything used libc syslog(), close this here, too */
2841 closelog();
2842
2843 n_fds = n_socket_fds + n_storage_fds;
2844 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
2845 if (r < 0) {
2846 *exit_status = EXIT_FDS;
2847 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2848 }
2849
2850 if (!context->same_pgrp)
2851 if (setsid() < 0) {
2852 *exit_status = EXIT_SETSID;
2853 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2854 }
2855
2856 exec_context_tty_reset(context, params);
2857
2858 if (unit_shall_confirm_spawn(unit)) {
2859 const char *vc = params->confirm_spawn;
2860 _cleanup_free_ char *cmdline = NULL;
2861
2862 cmdline = exec_command_line(command->argv);
2863 if (!cmdline) {
2864 *exit_status = EXIT_MEMORY;
2865 return log_oom();
2866 }
2867
2868 r = ask_for_confirmation(vc, unit, cmdline);
2869 if (r != CONFIRM_EXECUTE) {
2870 if (r == CONFIRM_PRETEND_SUCCESS) {
2871 *exit_status = EXIT_SUCCESS;
2872 return 0;
2873 }
2874 *exit_status = EXIT_CONFIRM;
2875 log_unit_error(unit, "Execution cancelled by the user");
2876 return -ECANCELED;
2877 }
2878 }
2879
2880 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
2881 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
2882 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
2883 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
2884 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
2885 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
2886 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
2887 *exit_status = EXIT_MEMORY;
2888 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2889 }
2890
2891 if (context->dynamic_user && dcreds) {
2892 _cleanup_strv_free_ char **suggested_paths = NULL;
2893
2894 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
2895 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
2896 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2897 *exit_status = EXIT_USER;
2898 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2899 }
2900
2901 r = compile_suggested_paths(context, params, &suggested_paths);
2902 if (r < 0) {
2903 *exit_status = EXIT_MEMORY;
2904 return log_oom();
2905 }
2906
2907 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2908 if (r < 0) {
2909 *exit_status = EXIT_USER;
2910 if (r == -EILSEQ) {
2911 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2912 return -EOPNOTSUPP;
2913 }
2914 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2915 }
2916
2917 if (!uid_is_valid(uid)) {
2918 *exit_status = EXIT_USER;
2919 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2920 return -ESRCH;
2921 }
2922
2923 if (!gid_is_valid(gid)) {
2924 *exit_status = EXIT_USER;
2925 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2926 return -ESRCH;
2927 }
2928
2929 if (dcreds->user)
2930 username = dcreds->user->name;
2931
2932 } else {
2933 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2934 if (r < 0) {
2935 *exit_status = EXIT_USER;
2936 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2937 }
2938
2939 r = get_fixed_group(context, &groupname, &gid);
2940 if (r < 0) {
2941 *exit_status = EXIT_GROUP;
2942 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2943 }
2944 }
2945
2946 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2947 r = get_supplementary_groups(context, username, groupname, gid,
2948 &supplementary_gids, &ngids);
2949 if (r < 0) {
2950 *exit_status = EXIT_GROUP;
2951 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2952 }
2953
2954 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2955 if (r < 0) {
2956 *exit_status = EXIT_USER;
2957 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2958 }
2959
2960 user_lookup_fd = safe_close(user_lookup_fd);
2961
2962 r = acquire_home(context, uid, &home, &home_buffer);
2963 if (r < 0) {
2964 *exit_status = EXIT_CHDIR;
2965 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2966 }
2967
2968 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2969 * must make sure to drop O_NONBLOCK */
2970 if (socket_fd >= 0)
2971 (void) fd_nonblock(socket_fd, false);
2972
2973 r = setup_input(context, params, socket_fd, named_iofds);
2974 if (r < 0) {
2975 *exit_status = EXIT_STDIN;
2976 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2977 }
2978
2979 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2980 if (r < 0) {
2981 *exit_status = EXIT_STDOUT;
2982 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2983 }
2984
2985 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2986 if (r < 0) {
2987 *exit_status = EXIT_STDERR;
2988 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2989 }
2990
2991 if (params->cgroup_path) {
2992 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2993 if (r < 0) {
2994 *exit_status = EXIT_CGROUP;
2995 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2996 }
2997 }
2998
2999 if (context->oom_score_adjust_set) {
3000 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3001 * prohibit write access to this file, and we shouldn't trip up over that. */
3002 r = set_oom_score_adjust(context->oom_score_adjust);
3003 if (IN_SET(r, -EPERM, -EACCES))
3004 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3005 else if (r < 0) {
3006 *exit_status = EXIT_OOM_ADJUST;
3007 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3008 }
3009 }
3010
3011 if (context->nice_set)
3012 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
3013 *exit_status = EXIT_NICE;
3014 return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
3015 }
3016
3017 if (context->cpu_sched_set) {
3018 struct sched_param param = {
3019 .sched_priority = context->cpu_sched_priority,
3020 };
3021
3022 r = sched_setscheduler(0,
3023 context->cpu_sched_policy |
3024 (context->cpu_sched_reset_on_fork ?
3025 SCHED_RESET_ON_FORK : 0),
3026 &param);
3027 if (r < 0) {
3028 *exit_status = EXIT_SETSCHEDULER;
3029 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3030 }
3031 }
3032
3033 if (context->cpuset)
3034 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
3035 *exit_status = EXIT_CPUAFFINITY;
3036 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3037 }
3038
3039 if (context->ioprio_set)
3040 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3041 *exit_status = EXIT_IOPRIO;
3042 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3043 }
3044
3045 if (context->timer_slack_nsec != NSEC_INFINITY)
3046 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3047 *exit_status = EXIT_TIMERSLACK;
3048 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3049 }
3050
3051 if (context->personality != PERSONALITY_INVALID) {
3052 r = safe_personality(context->personality);
3053 if (r < 0) {
3054 *exit_status = EXIT_PERSONALITY;
3055 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3056 }
3057 }
3058
3059 if (context->utmp_id)
3060 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3061 context->tty_path,
3062 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
3063 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3064 USER_PROCESS,
3065 username);
3066
3067 if (context->user) {
3068 r = chown_terminal(STDIN_FILENO, uid);
3069 if (r < 0) {
3070 *exit_status = EXIT_STDIN;
3071 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3072 }
3073 }
3074
3075 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
3076 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3077 * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3078 * touch a single hierarchy too. */
3079 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3080 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3081 if (r < 0) {
3082 *exit_status = EXIT_CGROUP;
3083 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3084 }
3085 }
3086
3087 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3088 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3089 if (r < 0)
3090 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3091 }
3092
3093 r = build_environment(
3094 unit,
3095 context,
3096 params,
3097 n_fds,
3098 home,
3099 username,
3100 shell,
3101 journal_stream_dev,
3102 journal_stream_ino,
3103 &our_env);
3104 if (r < 0) {
3105 *exit_status = EXIT_MEMORY;
3106 return log_oom();
3107 }
3108
3109 r = build_pass_environment(context, &pass_env);
3110 if (r < 0) {
3111 *exit_status = EXIT_MEMORY;
3112 return log_oom();
3113 }
3114
3115 accum_env = strv_env_merge(5,
3116 params->environment,
3117 our_env,
3118 pass_env,
3119 context->environment,
3120 files_env,
3121 NULL);
3122 if (!accum_env) {
3123 *exit_status = EXIT_MEMORY;
3124 return log_oom();
3125 }
3126 accum_env = strv_env_clean(accum_env);
3127
3128 (void) umask(context->umask);
3129
3130 r = setup_keyring(unit, context, params, uid, gid);
3131 if (r < 0) {
3132 *exit_status = EXIT_KEYRING;
3133 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3134 }
3135
3136 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3137 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3138
3139 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3140 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3141
3142 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3143 if (needs_ambient_hack)
3144 needs_setuid = false;
3145 else
3146 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3147
3148 if (needs_sandboxing) {
3149 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3150 * present. The actual MAC context application will happen later, as late as possible, to avoid
3151 * impacting our own code paths. */
3152
3153 #if HAVE_SELINUX
3154 use_selinux = mac_selinux_use();
3155 #endif
3156 #if ENABLE_SMACK
3157 use_smack = mac_smack_use();
3158 #endif
3159 #if HAVE_APPARMOR
3160 use_apparmor = mac_apparmor_use();
3161 #endif
3162 }
3163
3164 if (needs_setuid) {
3165 if (context->pam_name && username) {
3166 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3167 if (r < 0) {
3168 *exit_status = EXIT_PAM;
3169 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3170 }
3171 }
3172 }
3173
3174 if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3175 if (ns_type_supported(NAMESPACE_NET)) {
3176 r = setup_netns(runtime->netns_storage_socket);
3177 if (r < 0) {
3178 *exit_status = EXIT_NETWORK;
3179 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3180 }
3181 } else
3182 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3183 }
3184
3185 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3186 if (needs_mount_namespace) {
3187 r = apply_mount_namespace(unit, command, context, params, runtime);
3188 if (r < 0) {
3189 *exit_status = EXIT_NAMESPACE;
3190 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3191 }
3192 }
3193
3194 /* Apply just after mount namespace setup */
3195 r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3196 if (r < 0)
3197 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3198
3199 /* Drop groups as early as possible */
3200 if (needs_setuid) {
3201 r = enforce_groups(gid, supplementary_gids, ngids);
3202 if (r < 0) {
3203 *exit_status = EXIT_GROUP;
3204 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3205 }
3206 }
3207
3208 if (needs_sandboxing) {
3209 #if HAVE_SELINUX
3210 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3211 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3212 if (r < 0) {
3213 *exit_status = EXIT_SELINUX_CONTEXT;
3214 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3215 }
3216 }
3217 #endif
3218
3219 if (context->private_users) {
3220 r = setup_private_users(uid, gid);
3221 if (r < 0) {
3222 *exit_status = EXIT_USER;
3223 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3224 }
3225 }
3226 }
3227
3228 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3229 * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3230 * however if we have it as we want to keep it open until the final execve(). */
3231
3232 if (params->exec_fd >= 0) {
3233 exec_fd = params->exec_fd;
3234
3235 if (exec_fd < 3 + (int) n_fds) {
3236 int moved_fd;
3237
3238 /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3239 * process we are about to execute. */
3240
3241 moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3242 if (moved_fd < 0) {
3243 *exit_status = EXIT_FDS;
3244 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3245 }
3246
3247 safe_close(exec_fd);
3248 exec_fd = moved_fd;
3249 } else {
3250 /* This fd should be FD_CLOEXEC already, but let's make sure. */
3251 r = fd_cloexec(exec_fd, true);
3252 if (r < 0) {
3253 *exit_status = EXIT_FDS;
3254 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3255 }
3256 }
3257
3258 fds_with_exec_fd = newa(int, n_fds + 1);
3259 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3260 fds_with_exec_fd[n_fds] = exec_fd;
3261 n_fds_with_exec_fd = n_fds + 1;
3262 } else {
3263 fds_with_exec_fd = fds;
3264 n_fds_with_exec_fd = n_fds;
3265 }
3266
3267 r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3268 if (r >= 0)
3269 r = shift_fds(fds, n_fds);
3270 if (r >= 0)
3271 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3272 if (r < 0) {
3273 *exit_status = EXIT_FDS;
3274 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3275 }
3276
3277 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3278 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3279 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3280 * came this far. */
3281
3282 secure_bits = context->secure_bits;
3283
3284 if (needs_sandboxing) {
3285 uint64_t bset;
3286 int which_failed;
3287
3288 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3289 if (r < 0) {
3290 *exit_status = EXIT_LIMITS;
3291 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3292 }
3293
3294 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3295 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3296 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3297 *exit_status = EXIT_LIMITS;
3298 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3299 }
3300 }
3301
3302 #if ENABLE_SMACK
3303 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3304 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3305 if (use_smack) {
3306 r = setup_smack(context, command);
3307 if (r < 0) {
3308 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3309 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3310 }
3311 }
3312 #endif
3313
3314 bset = context->capability_bounding_set;
3315 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3316 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3317 * instead of us doing that */
3318 if (needs_ambient_hack)
3319 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3320 (UINT64_C(1) << CAP_SETUID) |
3321 (UINT64_C(1) << CAP_SETGID);
3322
3323 if (!cap_test_all(bset)) {
3324 r = capability_bounding_set_drop(bset, false);
3325 if (r < 0) {
3326 *exit_status = EXIT_CAPABILITIES;
3327 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3328 }
3329 }
3330
3331 /* This is done before enforce_user, but ambient set
3332 * does not survive over setresuid() if keep_caps is not set. */
3333 if (!needs_ambient_hack &&
3334 context->capability_ambient_set != 0) {
3335 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3336 if (r < 0) {
3337 *exit_status = EXIT_CAPABILITIES;
3338 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3339 }
3340 }
3341 }
3342
3343 if (needs_setuid) {
3344 if (context->user) {
3345 r = enforce_user(context, uid);
3346 if (r < 0) {
3347 *exit_status = EXIT_USER;
3348 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3349 }
3350
3351 if (!needs_ambient_hack &&
3352 context->capability_ambient_set != 0) {
3353
3354 /* Fix the ambient capabilities after user change. */
3355 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3356 if (r < 0) {
3357 *exit_status = EXIT_CAPABILITIES;
3358 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3359 }
3360
3361 /* If we were asked to change user and ambient capabilities
3362 * were requested, we had to add keep-caps to the securebits
3363 * so that we would maintain the inherited capability set
3364 * through the setresuid(). Make sure that the bit is added
3365 * also to the context secure_bits so that we don't try to
3366 * drop the bit away next. */
3367
3368 secure_bits |= 1<<SECURE_KEEP_CAPS;
3369 }
3370 }
3371 }
3372
3373 if (needs_sandboxing) {
3374 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3375 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3376 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3377 * are restricted. */
3378
3379 #if HAVE_SELINUX
3380 if (use_selinux) {
3381 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3382
3383 if (exec_context) {
3384 r = setexeccon(exec_context);
3385 if (r < 0) {
3386 *exit_status = EXIT_SELINUX_CONTEXT;
3387 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3388 }
3389 }
3390 }
3391 #endif
3392
3393 #if HAVE_APPARMOR
3394 if (use_apparmor && context->apparmor_profile) {
3395 r = aa_change_onexec(context->apparmor_profile);
3396 if (r < 0 && !context->apparmor_profile_ignore) {
3397 *exit_status = EXIT_APPARMOR_PROFILE;
3398 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3399 }
3400 }
3401 #endif
3402
3403 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3404 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3405 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3406 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3407 *exit_status = EXIT_SECUREBITS;
3408 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3409 }
3410
3411 if (context_has_no_new_privileges(context))
3412 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3413 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3414 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3415 }
3416
3417 #if HAVE_SECCOMP
3418 r = apply_address_families(unit, context);
3419 if (r < 0) {
3420 *exit_status = EXIT_ADDRESS_FAMILIES;
3421 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3422 }
3423
3424 r = apply_memory_deny_write_execute(unit, context);
3425 if (r < 0) {
3426 *exit_status = EXIT_SECCOMP;
3427 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3428 }
3429
3430 r = apply_restrict_realtime(unit, context);
3431 if (r < 0) {
3432 *exit_status = EXIT_SECCOMP;
3433 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3434 }
3435
3436 r = apply_restrict_namespaces(unit, context);
3437 if (r < 0) {
3438 *exit_status = EXIT_SECCOMP;
3439 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3440 }
3441
3442 r = apply_protect_sysctl(unit, context);
3443 if (r < 0) {
3444 *exit_status = EXIT_SECCOMP;
3445 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3446 }
3447
3448 r = apply_protect_kernel_modules(unit, context);
3449 if (r < 0) {
3450 *exit_status = EXIT_SECCOMP;
3451 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3452 }
3453
3454 r = apply_private_devices(unit, context);
3455 if (r < 0) {
3456 *exit_status = EXIT_SECCOMP;
3457 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3458 }
3459
3460 r = apply_syscall_archs(unit, context);
3461 if (r < 0) {
3462 *exit_status = EXIT_SECCOMP;
3463 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3464 }
3465
3466 r = apply_lock_personality(unit, context);
3467 if (r < 0) {
3468 *exit_status = EXIT_SECCOMP;
3469 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3470 }
3471
3472 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3473 * by the filter as little as possible. */
3474 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3475 if (r < 0) {
3476 *exit_status = EXIT_SECCOMP;
3477 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3478 }
3479 #endif
3480 }
3481
3482 if (!strv_isempty(context->unset_environment)) {
3483 char **ee = NULL;
3484
3485 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3486 if (!ee) {
3487 *exit_status = EXIT_MEMORY;
3488 return log_oom();
3489 }
3490
3491 strv_free_and_replace(accum_env, ee);
3492 }
3493
3494 final_argv = replace_env_argv(command->argv, accum_env);
3495 if (!final_argv) {
3496 *exit_status = EXIT_MEMORY;
3497 return log_oom();
3498 }
3499
3500 if (DEBUG_LOGGING) {
3501 _cleanup_free_ char *line;
3502
3503 line = exec_command_line(final_argv);
3504 if (line)
3505 log_struct(LOG_DEBUG,
3506 "EXECUTABLE=%s", command->path,
3507 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3508 LOG_UNIT_ID(unit),
3509 LOG_UNIT_INVOCATION_ID(unit));
3510 }
3511
3512 if (exec_fd >= 0) {
3513 uint8_t hot = 1;
3514
3515 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3516 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3517
3518 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3519 *exit_status = EXIT_EXEC;
3520 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3521 }
3522 }
3523
3524 execve(command->path, final_argv, accum_env);
3525 r = -errno;
3526
3527 if (exec_fd >= 0) {
3528 uint8_t hot = 0;
3529
3530 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3531 * that POLLHUP on it no longer means execve() succeeded. */
3532
3533 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3534 *exit_status = EXIT_EXEC;
3535 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
3536 }
3537 }
3538
3539 if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3540 log_struct_errno(LOG_INFO, r,
3541 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3542 LOG_UNIT_ID(unit),
3543 LOG_UNIT_INVOCATION_ID(unit),
3544 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3545 command->path),
3546 "EXECUTABLE=%s", command->path);
3547 return 0;
3548 }
3549
3550 *exit_status = EXIT_EXEC;
3551 return log_unit_error_errno(unit, r, "Failed to execute command: %m");
3552 }
3553
3554 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
3555 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]);
3556
3557 int exec_spawn(Unit *unit,
3558 ExecCommand *command,
3559 const ExecContext *context,
3560 const ExecParameters *params,
3561 ExecRuntime *runtime,
3562 DynamicCreds *dcreds,
3563 pid_t *ret) {
3564
3565 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
3566 _cleanup_strv_free_ char **files_env = NULL;
3567 size_t n_storage_fds = 0, n_socket_fds = 0;
3568 _cleanup_free_ char *line = NULL;
3569 pid_t pid;
3570
3571 assert(unit);
3572 assert(command);
3573 assert(context);
3574 assert(ret);
3575 assert(params);
3576 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
3577
3578 if (context->std_input == EXEC_INPUT_SOCKET ||
3579 context->std_output == EXEC_OUTPUT_SOCKET ||
3580 context->std_error == EXEC_OUTPUT_SOCKET) {
3581
3582 if (params->n_socket_fds > 1) {
3583 log_unit_error(unit, "Got more than one socket.");
3584 return -EINVAL;
3585 }
3586
3587 if (params->n_socket_fds == 0) {
3588 log_unit_error(unit, "Got no socket.");
3589 return -EINVAL;
3590 }
3591
3592 socket_fd = params->fds[0];
3593 } else {
3594 socket_fd = -1;
3595 fds = params->fds;
3596 n_socket_fds = params->n_socket_fds;
3597 n_storage_fds = params->n_storage_fds;
3598 }
3599
3600 r = exec_context_named_iofds(context, params, named_iofds);
3601 if (r < 0)
3602 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3603
3604 r = exec_context_load_environment(unit, context, &files_env);
3605 if (r < 0)
3606 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3607
3608 line = exec_command_line(command->argv);
3609 if (!line)
3610 return log_oom();
3611
3612 log_struct(LOG_DEBUG,
3613 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3614 "EXECUTABLE=%s", command->path,
3615 LOG_UNIT_ID(unit),
3616 LOG_UNIT_INVOCATION_ID(unit));
3617
3618 pid = fork();
3619 if (pid < 0)
3620 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3621
3622 if (pid == 0) {
3623 int exit_status = EXIT_SUCCESS;
3624
3625 r = exec_child(unit,
3626 command,
3627 context,
3628 params,
3629 runtime,
3630 dcreds,
3631 socket_fd,
3632 named_iofds,
3633 fds,
3634 n_socket_fds,
3635 n_storage_fds,
3636 files_env,
3637 unit->manager->user_lookup_fds[1],
3638 &exit_status);
3639
3640 if (r < 0)
3641 log_struct_errno(LOG_ERR, r,
3642 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3643 LOG_UNIT_ID(unit),
3644 LOG_UNIT_INVOCATION_ID(unit),
3645 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3646 exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3647 command->path),
3648 "EXECUTABLE=%s", command->path);
3649
3650 _exit(exit_status);
3651 }
3652
3653 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3654
3655 /* We add the new process to the cgroup both in the child (so
3656 * that we can be sure that no user code is ever executed
3657 * outside of the cgroup) and in the parent (so that we can be
3658 * sure that when we kill the cgroup the process will be
3659 * killed too). */
3660 if (params->cgroup_path)
3661 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3662
3663 exec_status_start(&command->exec_status, pid);
3664
3665 *ret = pid;
3666 return 0;
3667 }
3668
3669 void exec_context_init(ExecContext *c) {
3670 ExecDirectoryType i;
3671
3672 assert(c);
3673
3674 c->umask = 0022;
3675 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3676 c->cpu_sched_policy = SCHED_OTHER;
3677 c->syslog_priority = LOG_DAEMON|LOG_INFO;
3678 c->syslog_level_prefix = true;
3679 c->ignore_sigpipe = true;
3680 c->timer_slack_nsec = NSEC_INFINITY;
3681 c->personality = PERSONALITY_INVALID;
3682 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3683 c->directories[i].mode = 0755;
3684 c->capability_bounding_set = CAP_ALL;
3685 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
3686 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
3687 c->log_level_max = -1;
3688 }
3689
3690 void exec_context_done(ExecContext *c) {
3691 ExecDirectoryType i;
3692 size_t l;
3693
3694 assert(c);
3695
3696 c->environment = strv_free(c->environment);
3697 c->environment_files = strv_free(c->environment_files);
3698 c->pass_environment = strv_free(c->pass_environment);
3699 c->unset_environment = strv_free(c->unset_environment);
3700
3701 rlimit_free_all(c->rlimit);
3702
3703 for (l = 0; l < 3; l++) {
3704 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3705 c->stdio_file[l] = mfree(c->stdio_file[l]);
3706 }
3707
3708 c->working_directory = mfree(c->working_directory);
3709 c->root_directory = mfree(c->root_directory);
3710 c->root_image = mfree(c->root_image);
3711 c->tty_path = mfree(c->tty_path);
3712 c->syslog_identifier = mfree(c->syslog_identifier);
3713 c->user = mfree(c->user);
3714 c->group = mfree(c->group);
3715
3716 c->supplementary_groups = strv_free(c->supplementary_groups);
3717
3718 c->pam_name = mfree(c->pam_name);
3719
3720 c->read_only_paths = strv_free(c->read_only_paths);
3721 c->read_write_paths = strv_free(c->read_write_paths);
3722 c->inaccessible_paths = strv_free(c->inaccessible_paths);
3723
3724 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3725 c->bind_mounts = NULL;
3726 c->n_bind_mounts = 0;
3727 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3728 c->temporary_filesystems = NULL;
3729 c->n_temporary_filesystems = 0;
3730
3731 c->cpuset = cpu_set_mfree(c->cpuset);
3732
3733 c->utmp_id = mfree(c->utmp_id);
3734 c->selinux_context = mfree(c->selinux_context);
3735 c->apparmor_profile = mfree(c->apparmor_profile);
3736 c->smack_process_label = mfree(c->smack_process_label);
3737
3738 c->syscall_filter = hashmap_free(c->syscall_filter);
3739 c->syscall_archs = set_free(c->syscall_archs);
3740 c->address_families = set_free(c->address_families);
3741
3742 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3743 c->directories[i].paths = strv_free(c->directories[i].paths);
3744
3745 c->log_level_max = -1;
3746
3747 exec_context_free_log_extra_fields(c);
3748
3749 c->stdin_data = mfree(c->stdin_data);
3750 c->stdin_data_size = 0;
3751 }
3752
3753 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
3754 char **i;
3755
3756 assert(c);
3757
3758 if (!runtime_prefix)
3759 return 0;
3760
3761 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3762 _cleanup_free_ char *p;
3763
3764 p = strjoin(runtime_prefix, "/", *i);
3765 if (!p)
3766 return -ENOMEM;
3767
3768 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3769 * next. */
3770 (void) rm_rf(p, REMOVE_ROOT);
3771 }
3772
3773 return 0;
3774 }
3775
3776 static void exec_command_done(ExecCommand *c) {
3777 assert(c);
3778
3779 c->path = mfree(c->path);
3780 c->argv = strv_free(c->argv);
3781 }
3782
3783 void exec_command_done_array(ExecCommand *c, size_t n) {
3784 size_t i;
3785
3786 for (i = 0; i < n; i++)
3787 exec_command_done(c+i);
3788 }
3789
3790 ExecCommand* exec_command_free_list(ExecCommand *c) {
3791 ExecCommand *i;
3792
3793 while ((i = c)) {
3794 LIST_REMOVE(command, c, i);
3795 exec_command_done(i);
3796 free(i);
3797 }
3798
3799 return NULL;
3800 }
3801
3802 void exec_command_free_array(ExecCommand **c, size_t n) {
3803 size_t i;
3804
3805 for (i = 0; i < n; i++)
3806 c[i] = exec_command_free_list(c[i]);
3807 }
3808
3809 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
3810 size_t i;
3811
3812 for (i = 0; i < n; i++)
3813 exec_status_reset(&c[i].exec_status);
3814 }
3815
3816 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
3817 size_t i;
3818
3819 for (i = 0; i < n; i++) {
3820 ExecCommand *z;
3821
3822 LIST_FOREACH(command, z, c[i])
3823 exec_status_reset(&z->exec_status);
3824 }
3825 }
3826
3827 typedef struct InvalidEnvInfo {
3828 const Unit *unit;
3829 const char *path;
3830 } InvalidEnvInfo;
3831
3832 static void invalid_env(const char *p, void *userdata) {
3833 InvalidEnvInfo *info = userdata;
3834
3835 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3836 }
3837
3838 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3839 assert(c);
3840
3841 switch (fd_index) {
3842
3843 case STDIN_FILENO:
3844 if (c->std_input != EXEC_INPUT_NAMED_FD)
3845 return NULL;
3846
3847 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3848
3849 case STDOUT_FILENO:
3850 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3851 return NULL;
3852
3853 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3854
3855 case STDERR_FILENO:
3856 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3857 return NULL;
3858
3859 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3860
3861 default:
3862 return NULL;
3863 }
3864 }
3865
3866 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3867 size_t i, targets;
3868 const char* stdio_fdname[3];
3869 size_t n_fds;
3870
3871 assert(c);
3872 assert(p);
3873
3874 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3875 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3876 (c->std_error == EXEC_OUTPUT_NAMED_FD);
3877
3878 for (i = 0; i < 3; i++)
3879 stdio_fdname[i] = exec_context_fdname(c, i);
3880
3881 n_fds = p->n_storage_fds + p->n_socket_fds;
3882
3883 for (i = 0; i < n_fds && targets > 0; i++)
3884 if (named_iofds[STDIN_FILENO] < 0 &&
3885 c->std_input == EXEC_INPUT_NAMED_FD &&
3886 stdio_fdname[STDIN_FILENO] &&
3887 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3888
3889 named_iofds[STDIN_FILENO] = p->fds[i];
3890 targets--;
3891
3892 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3893 c->std_output == EXEC_OUTPUT_NAMED_FD &&
3894 stdio_fdname[STDOUT_FILENO] &&
3895 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3896
3897 named_iofds[STDOUT_FILENO] = p->fds[i];
3898 targets--;
3899
3900 } else if (named_iofds[STDERR_FILENO] < 0 &&
3901 c->std_error == EXEC_OUTPUT_NAMED_FD &&
3902 stdio_fdname[STDERR_FILENO] &&
3903 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3904
3905 named_iofds[STDERR_FILENO] = p->fds[i];
3906 targets--;
3907 }
3908
3909 return targets == 0 ? 0 : -ENOENT;
3910 }
3911
3912 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
3913 char **i, **r = NULL;
3914
3915 assert(c);
3916 assert(l);
3917
3918 STRV_FOREACH(i, c->environment_files) {
3919 char *fn;
3920 int k;
3921 unsigned n;
3922 bool ignore = false;
3923 char **p;
3924 _cleanup_globfree_ glob_t pglob = {};
3925
3926 fn = *i;
3927
3928 if (fn[0] == '-') {
3929 ignore = true;
3930 fn++;
3931 }
3932
3933 if (!path_is_absolute(fn)) {
3934 if (ignore)
3935 continue;
3936
3937 strv_free(r);
3938 return -EINVAL;
3939 }
3940
3941 /* Filename supports globbing, take all matching files */
3942 k = safe_glob(fn, 0, &pglob);
3943 if (k < 0) {
3944 if (ignore)
3945 continue;
3946
3947 strv_free(r);
3948 return k;
3949 }
3950
3951 /* When we don't match anything, -ENOENT should be returned */
3952 assert(pglob.gl_pathc > 0);
3953
3954 for (n = 0; n < pglob.gl_pathc; n++) {
3955 k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3956 if (k < 0) {
3957 if (ignore)
3958 continue;
3959
3960 strv_free(r);
3961 return k;
3962 }
3963 /* Log invalid environment variables with filename */
3964 if (p) {
3965 InvalidEnvInfo info = {
3966 .unit = unit,
3967 .path = pglob.gl_pathv[n]
3968 };
3969
3970 p = strv_env_clean_with_callback(p, invalid_env, &info);
3971 }
3972
3973 if (!r)
3974 r = p;
3975 else {
3976 char **m;
3977
3978 m = strv_env_merge(2, r, p);
3979 strv_free(r);
3980 strv_free(p);
3981 if (!m)
3982 return -ENOMEM;
3983
3984 r = m;
3985 }
3986 }
3987 }
3988
3989 *l = r;
3990
3991 return 0;
3992 }
3993
3994 static bool tty_may_match_dev_console(const char *tty) {
3995 _cleanup_free_ char *resolved = NULL;
3996
3997 if (!tty)
3998 return true;
3999
4000 tty = skip_dev_prefix(tty);
4001
4002 /* trivial identity? */
4003 if (streq(tty, "console"))
4004 return true;
4005
4006 if (resolve_dev_console(&resolved) < 0)
4007 return true; /* if we could not resolve, assume it may */
4008
4009 /* "tty0" means the active VC, so it may be the same sometimes */
4010 return streq(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4011 }
4012
4013 bool exec_context_may_touch_console(const ExecContext *ec) {
4014
4015 return (ec->tty_reset ||
4016 ec->tty_vhangup ||
4017 ec->tty_vt_disallocate ||
4018 is_terminal_input(ec->std_input) ||
4019 is_terminal_output(ec->std_output) ||
4020 is_terminal_output(ec->std_error)) &&
4021 tty_may_match_dev_console(exec_context_tty_path(ec));
4022 }
4023
4024 static void strv_fprintf(FILE *f, char **l) {
4025 char **g;
4026
4027 assert(f);
4028
4029 STRV_FOREACH(g, l)
4030 fprintf(f, " %s", *g);
4031 }
4032
4033 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4034 ExecDirectoryType dt;
4035 char **e, **d;
4036 unsigned i;
4037 int r;
4038
4039 assert(c);
4040 assert(f);
4041
4042 prefix = strempty(prefix);
4043
4044 fprintf(f,
4045 "%sUMask: %04o\n"
4046 "%sWorkingDirectory: %s\n"
4047 "%sRootDirectory: %s\n"
4048 "%sNonBlocking: %s\n"
4049 "%sPrivateTmp: %s\n"
4050 "%sPrivateDevices: %s\n"
4051 "%sProtectKernelTunables: %s\n"
4052 "%sProtectKernelModules: %s\n"
4053 "%sProtectControlGroups: %s\n"
4054 "%sPrivateNetwork: %s\n"
4055 "%sPrivateUsers: %s\n"
4056 "%sProtectHome: %s\n"
4057 "%sProtectSystem: %s\n"
4058 "%sMountAPIVFS: %s\n"
4059 "%sIgnoreSIGPIPE: %s\n"
4060 "%sMemoryDenyWriteExecute: %s\n"
4061 "%sRestrictRealtime: %s\n"
4062 "%sKeyringMode: %s\n",
4063 prefix, c->umask,
4064 prefix, c->working_directory ? c->working_directory : "/",
4065 prefix, c->root_directory ? c->root_directory : "/",
4066 prefix, yes_no(c->non_blocking),
4067 prefix, yes_no(c->private_tmp),
4068 prefix, yes_no(c->private_devices),
4069 prefix, yes_no(c->protect_kernel_tunables),
4070 prefix, yes_no(c->protect_kernel_modules),
4071 prefix, yes_no(c->protect_control_groups),
4072 prefix, yes_no(c->private_network),
4073 prefix, yes_no(c->private_users),
4074 prefix, protect_home_to_string(c->protect_home),
4075 prefix, protect_system_to_string(c->protect_system),
4076 prefix, yes_no(c->mount_apivfs),
4077 prefix, yes_no(c->ignore_sigpipe),
4078 prefix, yes_no(c->memory_deny_write_execute),
4079 prefix, yes_no(c->restrict_realtime),
4080 prefix, exec_keyring_mode_to_string(c->keyring_mode));
4081
4082 if (c->root_image)
4083 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4084
4085 STRV_FOREACH(e, c->environment)
4086 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4087
4088 STRV_FOREACH(e, c->environment_files)
4089 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4090
4091 STRV_FOREACH(e, c->pass_environment)
4092 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4093
4094 STRV_FOREACH(e, c->unset_environment)
4095 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4096
4097 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4098
4099 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4100 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4101
4102 STRV_FOREACH(d, c->directories[dt].paths)
4103 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4104 }
4105
4106 if (c->nice_set)
4107 fprintf(f,
4108 "%sNice: %i\n",
4109 prefix, c->nice);
4110
4111 if (c->oom_score_adjust_set)
4112 fprintf(f,
4113 "%sOOMScoreAdjust: %i\n",
4114 prefix, c->oom_score_adjust);
4115
4116 for (i = 0; i < RLIM_NLIMITS; i++)
4117 if (c->rlimit[i]) {
4118 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4119 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4120 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4121 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4122 }
4123
4124 if (c->ioprio_set) {
4125 _cleanup_free_ char *class_str = NULL;
4126
4127 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4128 if (r >= 0)
4129 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4130
4131 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4132 }
4133
4134 if (c->cpu_sched_set) {
4135 _cleanup_free_ char *policy_str = NULL;
4136
4137 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4138 if (r >= 0)
4139 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4140
4141 fprintf(f,
4142 "%sCPUSchedulingPriority: %i\n"
4143 "%sCPUSchedulingResetOnFork: %s\n",
4144 prefix, c->cpu_sched_priority,
4145 prefix, yes_no(c->cpu_sched_reset_on_fork));
4146 }
4147
4148 if (c->cpuset) {
4149 fprintf(f, "%sCPUAffinity:", prefix);
4150 for (i = 0; i < c->cpuset_ncpus; i++)
4151 if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
4152 fprintf(f, " %u", i);
4153 fputs("\n", f);
4154 }
4155
4156 if (c->timer_slack_nsec != NSEC_INFINITY)
4157 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4158
4159 fprintf(f,
4160 "%sStandardInput: %s\n"
4161 "%sStandardOutput: %s\n"
4162 "%sStandardError: %s\n",
4163 prefix, exec_input_to_string(c->std_input),
4164 prefix, exec_output_to_string(c->std_output),
4165 prefix, exec_output_to_string(c->std_error));
4166
4167 if (c->std_input == EXEC_INPUT_NAMED_FD)
4168 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4169 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4170 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4171 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4172 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4173
4174 if (c->std_input == EXEC_INPUT_FILE)
4175 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4176 if (c->std_output == EXEC_OUTPUT_FILE)
4177 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4178 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4179 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4180 if (c->std_error == EXEC_OUTPUT_FILE)
4181 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4182 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4183 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4184
4185 if (c->tty_path)
4186 fprintf(f,
4187 "%sTTYPath: %s\n"
4188 "%sTTYReset: %s\n"
4189 "%sTTYVHangup: %s\n"
4190 "%sTTYVTDisallocate: %s\n",
4191 prefix, c->tty_path,
4192 prefix, yes_no(c->tty_reset),
4193 prefix, yes_no(c->tty_vhangup),
4194 prefix, yes_no(c->tty_vt_disallocate));
4195
4196 if (IN_SET(c->std_output,
4197 EXEC_OUTPUT_SYSLOG,
4198 EXEC_OUTPUT_KMSG,
4199 EXEC_OUTPUT_JOURNAL,
4200 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4201 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4202 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4203 IN_SET(c->std_error,
4204 EXEC_OUTPUT_SYSLOG,
4205 EXEC_OUTPUT_KMSG,
4206 EXEC_OUTPUT_JOURNAL,
4207 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4208 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4209 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4210
4211 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4212
4213 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4214 if (r >= 0)
4215 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4216
4217 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4218 if (r >= 0)
4219 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4220 }
4221
4222 if (c->log_level_max >= 0) {
4223 _cleanup_free_ char *t = NULL;
4224
4225 (void) log_level_to_string_alloc(c->log_level_max, &t);
4226
4227 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4228 }
4229
4230 if (c->n_log_extra_fields > 0) {
4231 size_t j;
4232
4233 for (j = 0; j < c->n_log_extra_fields; j++) {
4234 fprintf(f, "%sLogExtraFields: ", prefix);
4235 fwrite(c->log_extra_fields[j].iov_base,
4236 1, c->log_extra_fields[j].iov_len,
4237 f);
4238 fputc('\n', f);
4239 }
4240 }
4241
4242 if (c->secure_bits) {
4243 _cleanup_free_ char *str = NULL;
4244
4245 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4246 if (r >= 0)
4247 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4248 }
4249
4250 if (c->capability_bounding_set != CAP_ALL) {
4251 _cleanup_free_ char *str = NULL;
4252
4253 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4254 if (r >= 0)
4255 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4256 }
4257
4258 if (c->capability_ambient_set != 0) {
4259 _cleanup_free_ char *str = NULL;
4260
4261 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4262 if (r >= 0)
4263 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4264 }
4265
4266 if (c->user)
4267 fprintf(f, "%sUser: %s\n", prefix, c->user);
4268 if (c->group)
4269 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4270
4271 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4272
4273 if (!strv_isempty(c->supplementary_groups)) {
4274 fprintf(f, "%sSupplementaryGroups:", prefix);
4275 strv_fprintf(f, c->supplementary_groups);
4276 fputs("\n", f);
4277 }
4278
4279 if (c->pam_name)
4280 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4281
4282 if (!strv_isempty(c->read_write_paths)) {
4283 fprintf(f, "%sReadWritePaths:", prefix);
4284 strv_fprintf(f, c->read_write_paths);
4285 fputs("\n", f);
4286 }
4287
4288 if (!strv_isempty(c->read_only_paths)) {
4289 fprintf(f, "%sReadOnlyPaths:", prefix);
4290 strv_fprintf(f, c->read_only_paths);
4291 fputs("\n", f);
4292 }
4293
4294 if (!strv_isempty(c->inaccessible_paths)) {
4295 fprintf(f, "%sInaccessiblePaths:", prefix);
4296 strv_fprintf(f, c->inaccessible_paths);
4297 fputs("\n", f);
4298 }
4299
4300 if (c->n_bind_mounts > 0)
4301 for (i = 0; i < c->n_bind_mounts; i++)
4302 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4303 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4304 c->bind_mounts[i].ignore_enoent ? "-": "",
4305 c->bind_mounts[i].source,
4306 c->bind_mounts[i].destination,
4307 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4308
4309 if (c->n_temporary_filesystems > 0)
4310 for (i = 0; i < c->n_temporary_filesystems; i++) {
4311 TemporaryFileSystem *t = c->temporary_filesystems + i;
4312
4313 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4314 t->path,
4315 isempty(t->options) ? "" : ":",
4316 strempty(t->options));
4317 }
4318
4319 if (c->utmp_id)
4320 fprintf(f,
4321 "%sUtmpIdentifier: %s\n",
4322 prefix, c->utmp_id);
4323
4324 if (c->selinux_context)
4325 fprintf(f,
4326 "%sSELinuxContext: %s%s\n",
4327 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4328
4329 if (c->apparmor_profile)
4330 fprintf(f,
4331 "%sAppArmorProfile: %s%s\n",
4332 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4333
4334 if (c->smack_process_label)
4335 fprintf(f,
4336 "%sSmackProcessLabel: %s%s\n",
4337 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4338
4339 if (c->personality != PERSONALITY_INVALID)
4340 fprintf(f,
4341 "%sPersonality: %s\n",
4342 prefix, strna(personality_to_string(c->personality)));
4343
4344 fprintf(f,
4345 "%sLockPersonality: %s\n",
4346 prefix, yes_no(c->lock_personality));
4347
4348 if (c->syscall_filter) {
4349 #if HAVE_SECCOMP
4350 Iterator j;
4351 void *id, *val;
4352 bool first = true;
4353 #endif
4354
4355 fprintf(f,
4356 "%sSystemCallFilter: ",
4357 prefix);
4358
4359 if (!c->syscall_whitelist)
4360 fputc('~', f);
4361
4362 #if HAVE_SECCOMP
4363 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4364 _cleanup_free_ char *name = NULL;
4365 const char *errno_name = NULL;
4366 int num = PTR_TO_INT(val);
4367
4368 if (first)
4369 first = false;
4370 else
4371 fputc(' ', f);
4372
4373 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4374 fputs(strna(name), f);
4375
4376 if (num >= 0) {
4377 errno_name = errno_to_name(num);
4378 if (errno_name)
4379 fprintf(f, ":%s", errno_name);
4380 else
4381 fprintf(f, ":%d", num);
4382 }
4383 }
4384 #endif
4385
4386 fputc('\n', f);
4387 }
4388
4389 if (c->syscall_archs) {
4390 #if HAVE_SECCOMP
4391 Iterator j;
4392 void *id;
4393 #endif
4394
4395 fprintf(f,
4396 "%sSystemCallArchitectures:",
4397 prefix);
4398
4399 #if HAVE_SECCOMP
4400 SET_FOREACH(id, c->syscall_archs, j)
4401 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4402 #endif
4403 fputc('\n', f);
4404 }
4405
4406 if (exec_context_restrict_namespaces_set(c)) {
4407 _cleanup_free_ char *s = NULL;
4408
4409 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4410 if (r >= 0)
4411 fprintf(f, "%sRestrictNamespaces: %s\n",
4412 prefix, s);
4413 }
4414
4415 if (c->syscall_errno > 0) {
4416 const char *errno_name;
4417
4418 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4419
4420 errno_name = errno_to_name(c->syscall_errno);
4421 if (errno_name)
4422 fprintf(f, "%s\n", errno_name);
4423 else
4424 fprintf(f, "%d\n", c->syscall_errno);
4425 }
4426
4427 if (c->apparmor_profile)
4428 fprintf(f,
4429 "%sAppArmorProfile: %s%s\n",
4430 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4431 }
4432
4433 bool exec_context_maintains_privileges(const ExecContext *c) {
4434 assert(c);
4435
4436 /* Returns true if the process forked off would run under
4437 * an unchanged UID or as root. */
4438
4439 if (!c->user)
4440 return true;
4441
4442 if (streq(c->user, "root") || streq(c->user, "0"))
4443 return true;
4444
4445 return false;
4446 }
4447
4448 int exec_context_get_effective_ioprio(const ExecContext *c) {
4449 int p;
4450
4451 assert(c);
4452
4453 if (c->ioprio_set)
4454 return c->ioprio;
4455
4456 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4457 if (p < 0)
4458 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4459
4460 return p;
4461 }
4462
4463 void exec_context_free_log_extra_fields(ExecContext *c) {
4464 size_t l;
4465
4466 assert(c);
4467
4468 for (l = 0; l < c->n_log_extra_fields; l++)
4469 free(c->log_extra_fields[l].iov_base);
4470 c->log_extra_fields = mfree(c->log_extra_fields);
4471 c->n_log_extra_fields = 0;
4472 }
4473
4474 void exec_status_start(ExecStatus *s, pid_t pid) {
4475 assert(s);
4476
4477 *s = (ExecStatus) {
4478 .pid = pid,
4479 };
4480
4481 dual_timestamp_get(&s->start_timestamp);
4482 }
4483
4484 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4485 assert(s);
4486
4487 if (s->pid != pid) {
4488 *s = (ExecStatus) {
4489 .pid = pid,
4490 };
4491 }
4492
4493 dual_timestamp_get(&s->exit_timestamp);
4494
4495 s->code = code;
4496 s->status = status;
4497
4498 if (context) {
4499 if (context->utmp_id)
4500 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
4501
4502 exec_context_tty_reset(context, NULL);
4503 }
4504 }
4505
4506 void exec_status_reset(ExecStatus *s) {
4507 assert(s);
4508
4509 *s = (ExecStatus) {};
4510 }
4511
4512 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4513 char buf[FORMAT_TIMESTAMP_MAX];
4514
4515 assert(s);
4516 assert(f);
4517
4518 if (s->pid <= 0)
4519 return;
4520
4521 prefix = strempty(prefix);
4522
4523 fprintf(f,
4524 "%sPID: "PID_FMT"\n",
4525 prefix, s->pid);
4526
4527 if (dual_timestamp_is_set(&s->start_timestamp))
4528 fprintf(f,
4529 "%sStart Timestamp: %s\n",
4530 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4531
4532 if (dual_timestamp_is_set(&s->exit_timestamp))
4533 fprintf(f,
4534 "%sExit Timestamp: %s\n"
4535 "%sExit Code: %s\n"
4536 "%sExit Status: %i\n",
4537 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4538 prefix, sigchld_code_to_string(s->code),
4539 prefix, s->status);
4540 }
4541
4542 static char *exec_command_line(char **argv) {
4543 size_t k;
4544 char *n, *p, **a;
4545 bool first = true;
4546
4547 assert(argv);
4548
4549 k = 1;
4550 STRV_FOREACH(a, argv)
4551 k += strlen(*a)+3;
4552
4553 n = new(char, k);
4554 if (!n)
4555 return NULL;
4556
4557 p = n;
4558 STRV_FOREACH(a, argv) {
4559
4560 if (!first)
4561 *(p++) = ' ';
4562 else
4563 first = false;
4564
4565 if (strpbrk(*a, WHITESPACE)) {
4566 *(p++) = '\'';
4567 p = stpcpy(p, *a);
4568 *(p++) = '\'';
4569 } else
4570 p = stpcpy(p, *a);
4571
4572 }
4573
4574 *p = 0;
4575
4576 /* FIXME: this doesn't really handle arguments that have
4577 * spaces and ticks in them */
4578
4579 return n;
4580 }
4581
4582 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4583 _cleanup_free_ char *cmd = NULL;
4584 const char *prefix2;
4585
4586 assert(c);
4587 assert(f);
4588
4589 prefix = strempty(prefix);
4590 prefix2 = strjoina(prefix, "\t");
4591
4592 cmd = exec_command_line(c->argv);
4593 fprintf(f,
4594 "%sCommand Line: %s\n",
4595 prefix, cmd ? cmd : strerror(ENOMEM));
4596
4597 exec_status_dump(&c->exec_status, f, prefix2);
4598 }
4599
4600 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4601 assert(f);
4602
4603 prefix = strempty(prefix);
4604
4605 LIST_FOREACH(command, c, c)
4606 exec_command_dump(c, f, prefix);
4607 }
4608
4609 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4610 ExecCommand *end;
4611
4612 assert(l);
4613 assert(e);
4614
4615 if (*l) {
4616 /* It's kind of important, that we keep the order here */
4617 LIST_FIND_TAIL(command, *l, end);
4618 LIST_INSERT_AFTER(command, *l, end, e);
4619 } else
4620 *l = e;
4621 }
4622
4623 int exec_command_set(ExecCommand *c, const char *path, ...) {
4624 va_list ap;
4625 char **l, *p;
4626
4627 assert(c);
4628 assert(path);
4629
4630 va_start(ap, path);
4631 l = strv_new_ap(path, ap);
4632 va_end(ap);
4633
4634 if (!l)
4635 return -ENOMEM;
4636
4637 p = strdup(path);
4638 if (!p) {
4639 strv_free(l);
4640 return -ENOMEM;
4641 }
4642
4643 free(c->path);
4644 c->path = p;
4645
4646 return strv_free_and_replace(c->argv, l);
4647 }
4648
4649 int exec_command_append(ExecCommand *c, const char *path, ...) {
4650 _cleanup_strv_free_ char **l = NULL;
4651 va_list ap;
4652 int r;
4653
4654 assert(c);
4655 assert(path);
4656
4657 va_start(ap, path);
4658 l = strv_new_ap(path, ap);
4659 va_end(ap);
4660
4661 if (!l)
4662 return -ENOMEM;
4663
4664 r = strv_extend_strv(&c->argv, l, false);
4665 if (r < 0)
4666 return r;
4667
4668 return 0;
4669 }
4670
4671 static void *remove_tmpdir_thread(void *p) {
4672 _cleanup_free_ char *path = p;
4673
4674 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4675 return NULL;
4676 }
4677
4678 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
4679 int r;
4680
4681 if (!rt)
4682 return NULL;
4683
4684 if (rt->manager)
4685 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
4686
4687 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
4688 if (destroy && rt->tmp_dir) {
4689 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4690
4691 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4692 if (r < 0) {
4693 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4694 free(rt->tmp_dir);
4695 }
4696
4697 rt->tmp_dir = NULL;
4698 }
4699
4700 if (destroy && rt->var_tmp_dir) {
4701 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4702
4703 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4704 if (r < 0) {
4705 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4706 free(rt->var_tmp_dir);
4707 }
4708
4709 rt->var_tmp_dir = NULL;
4710 }
4711
4712 rt->id = mfree(rt->id);
4713 rt->tmp_dir = mfree(rt->tmp_dir);
4714 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
4715 safe_close_pair(rt->netns_storage_socket);
4716 return mfree(rt);
4717 }
4718
4719 static void exec_runtime_freep(ExecRuntime **rt) {
4720 if (*rt)
4721 (void) exec_runtime_free(*rt, false);
4722 }
4723
4724 static int exec_runtime_allocate(ExecRuntime **rt) {
4725 assert(rt);
4726
4727 *rt = new0(ExecRuntime, 1);
4728 if (!*rt)
4729 return -ENOMEM;
4730
4731 (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4732 return 0;
4733 }
4734
4735 static int exec_runtime_add(
4736 Manager *m,
4737 const char *id,
4738 const char *tmp_dir,
4739 const char *var_tmp_dir,
4740 const int netns_storage_socket[2],
4741 ExecRuntime **ret) {
4742
4743 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
4744 int r;
4745
4746 assert(m);
4747 assert(id);
4748
4749 r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
4750 if (r < 0)
4751 return r;
4752
4753 r = exec_runtime_allocate(&rt);
4754 if (r < 0)
4755 return r;
4756
4757 rt->id = strdup(id);
4758 if (!rt->id)
4759 return -ENOMEM;
4760
4761 if (tmp_dir) {
4762 rt->tmp_dir = strdup(tmp_dir);
4763 if (!rt->tmp_dir)
4764 return -ENOMEM;
4765
4766 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
4767 assert(var_tmp_dir);
4768 rt->var_tmp_dir = strdup(var_tmp_dir);
4769 if (!rt->var_tmp_dir)
4770 return -ENOMEM;
4771 }
4772
4773 if (netns_storage_socket) {
4774 rt->netns_storage_socket[0] = netns_storage_socket[0];
4775 rt->netns_storage_socket[1] = netns_storage_socket[1];
4776 }
4777
4778 r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
4779 if (r < 0)
4780 return r;
4781
4782 rt->manager = m;
4783
4784 if (ret)
4785 *ret = rt;
4786
4787 /* do not remove created ExecRuntime object when the operation succeeds. */
4788 rt = NULL;
4789 return 0;
4790 }
4791
4792 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
4793 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
4794 _cleanup_close_pair_ int netns_storage_socket[2] = {-1, -1};
4795 int r;
4796
4797 assert(m);
4798 assert(c);
4799 assert(id);
4800
4801 /* It is not necessary to create ExecRuntime object. */
4802 if (!c->private_network && !c->private_tmp)
4803 return 0;
4804
4805 if (c->private_tmp) {
4806 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
4807 if (r < 0)
4808 return r;
4809 }
4810
4811 if (c->private_network) {
4812 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
4813 return -errno;
4814 }
4815
4816 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
4817 if (r < 0)
4818 return r;
4819
4820 /* Avoid cleanup */
4821 netns_storage_socket[0] = -1;
4822 netns_storage_socket[1] = -1;
4823 return 1;
4824 }
4825
4826 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
4827 ExecRuntime *rt;
4828 int r;
4829
4830 assert(m);
4831 assert(id);
4832 assert(ret);
4833
4834 rt = hashmap_get(m->exec_runtime_by_id, id);
4835 if (rt)
4836 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
4837 goto ref;
4838
4839 if (!create)
4840 return 0;
4841
4842 /* If not found, then create a new object. */
4843 r = exec_runtime_make(m, c, id, &rt);
4844 if (r <= 0)
4845 /* When r == 0, it is not necessary to create ExecRuntime object. */
4846 return r;
4847
4848 ref:
4849 /* increment reference counter. */
4850 rt->n_ref++;
4851 *ret = rt;
4852 return 1;
4853 }
4854
4855 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
4856 if (!rt)
4857 return NULL;
4858
4859 assert(rt->n_ref > 0);
4860
4861 rt->n_ref--;
4862 if (rt->n_ref > 0)
4863 return NULL;
4864
4865 return exec_runtime_free(rt, destroy);
4866 }
4867
4868 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
4869 ExecRuntime *rt;
4870 Iterator i;
4871
4872 assert(m);
4873 assert(f);
4874 assert(fds);
4875
4876 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4877 fprintf(f, "exec-runtime=%s", rt->id);
4878
4879 if (rt->tmp_dir)
4880 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
4881
4882 if (rt->var_tmp_dir)
4883 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
4884
4885 if (rt->netns_storage_socket[0] >= 0) {
4886 int copy;
4887
4888 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4889 if (copy < 0)
4890 return copy;
4891
4892 fprintf(f, " netns-socket-0=%i", copy);
4893 }
4894
4895 if (rt->netns_storage_socket[1] >= 0) {
4896 int copy;
4897
4898 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4899 if (copy < 0)
4900 return copy;
4901
4902 fprintf(f, " netns-socket-1=%i", copy);
4903 }
4904
4905 fputc('\n', f);
4906 }
4907
4908 return 0;
4909 }
4910
4911 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
4912 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
4913 ExecRuntime *rt;
4914 int r;
4915
4916 /* This is for the migration from old (v237 or earlier) deserialization text.
4917 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
4918 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
4919 * so or not from the serialized text, then we always creates a new object owned by this. */
4920
4921 assert(u);
4922 assert(key);
4923 assert(value);
4924
4925 /* Manager manages ExecRuntime objects by the unit id.
4926 * So, we omit the serialized text when the unit does not have id (yet?)... */
4927 if (isempty(u->id)) {
4928 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
4929 return 0;
4930 }
4931
4932 r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
4933 if (r < 0) {
4934 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
4935 return 0;
4936 }
4937
4938 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
4939 if (!rt) {
4940 r = exec_runtime_allocate(&rt_create);
4941 if (r < 0)
4942 return log_oom();
4943
4944 rt_create->id = strdup(u->id);
4945 if (!rt_create->id)
4946 return log_oom();
4947
4948 rt = rt_create;
4949 }
4950
4951 if (streq(key, "tmp-dir")) {
4952 char *copy;
4953
4954 copy = strdup(value);
4955 if (!copy)
4956 return log_oom();
4957
4958 free_and_replace(rt->tmp_dir, copy);
4959
4960 } else if (streq(key, "var-tmp-dir")) {
4961 char *copy;
4962
4963 copy = strdup(value);
4964 if (!copy)
4965 return log_oom();
4966
4967 free_and_replace(rt->var_tmp_dir, copy);
4968
4969 } else if (streq(key, "netns-socket-0")) {
4970 int fd;
4971
4972 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4973 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4974 return 0;
4975 }
4976
4977 safe_close(rt->netns_storage_socket[0]);
4978 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
4979
4980 } else if (streq(key, "netns-socket-1")) {
4981 int fd;
4982
4983 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4984 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4985 return 0;
4986 }
4987
4988 safe_close(rt->netns_storage_socket[1]);
4989 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
4990 } else
4991 return 0;
4992
4993 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
4994 if (rt_create) {
4995 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
4996 if (r < 0) {
4997 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
4998 return 0;
4999 }
5000
5001 rt_create->manager = u->manager;
5002
5003 /* Avoid cleanup */
5004 rt_create = NULL;
5005 }
5006
5007 return 1;
5008 }
5009
5010 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5011 char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5012 int r, fd0 = -1, fd1 = -1;
5013 const char *p, *v = value;
5014 size_t n;
5015
5016 assert(m);
5017 assert(value);
5018 assert(fds);
5019
5020 n = strcspn(v, " ");
5021 id = strndupa(v, n);
5022 if (v[n] != ' ')
5023 goto finalize;
5024 p = v + n + 1;
5025
5026 v = startswith(p, "tmp-dir=");
5027 if (v) {
5028 n = strcspn(v, " ");
5029 tmp_dir = strndupa(v, n);
5030 if (v[n] != ' ')
5031 goto finalize;
5032 p = v + n + 1;
5033 }
5034
5035 v = startswith(p, "var-tmp-dir=");
5036 if (v) {
5037 n = strcspn(v, " ");
5038 var_tmp_dir = strndupa(v, n);
5039 if (v[n] != ' ')
5040 goto finalize;
5041 p = v + n + 1;
5042 }
5043
5044 v = startswith(p, "netns-socket-0=");
5045 if (v) {
5046 char *buf;
5047
5048 n = strcspn(v, " ");
5049 buf = strndupa(v, n);
5050 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5051 log_debug("Unable to process exec-runtime netns fd specification.");
5052 return;
5053 }
5054 fd0 = fdset_remove(fds, fd0);
5055 if (v[n] != ' ')
5056 goto finalize;
5057 p = v + n + 1;
5058 }
5059
5060 v = startswith(p, "netns-socket-1=");
5061 if (v) {
5062 char *buf;
5063
5064 n = strcspn(v, " ");
5065 buf = strndupa(v, n);
5066 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5067 log_debug("Unable to process exec-runtime netns fd specification.");
5068 return;
5069 }
5070 fd1 = fdset_remove(fds, fd1);
5071 }
5072
5073 finalize:
5074
5075 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5076 if (r < 0) {
5077 log_debug_errno(r, "Failed to add exec-runtime: %m");
5078 return;
5079 }
5080 }
5081
5082 void exec_runtime_vacuum(Manager *m) {
5083 ExecRuntime *rt;
5084 Iterator i;
5085
5086 assert(m);
5087
5088 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5089
5090 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5091 if (rt->n_ref > 0)
5092 continue;
5093
5094 (void) exec_runtime_free(rt, false);
5095 }
5096 }
5097
5098 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5099 [EXEC_INPUT_NULL] = "null",
5100 [EXEC_INPUT_TTY] = "tty",
5101 [EXEC_INPUT_TTY_FORCE] = "tty-force",
5102 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
5103 [EXEC_INPUT_SOCKET] = "socket",
5104 [EXEC_INPUT_NAMED_FD] = "fd",
5105 [EXEC_INPUT_DATA] = "data",
5106 [EXEC_INPUT_FILE] = "file",
5107 };
5108
5109 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5110
5111 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5112 [EXEC_OUTPUT_INHERIT] = "inherit",
5113 [EXEC_OUTPUT_NULL] = "null",
5114 [EXEC_OUTPUT_TTY] = "tty",
5115 [EXEC_OUTPUT_SYSLOG] = "syslog",
5116 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
5117 [EXEC_OUTPUT_KMSG] = "kmsg",
5118 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5119 [EXEC_OUTPUT_JOURNAL] = "journal",
5120 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5121 [EXEC_OUTPUT_SOCKET] = "socket",
5122 [EXEC_OUTPUT_NAMED_FD] = "fd",
5123 [EXEC_OUTPUT_FILE] = "file",
5124 [EXEC_OUTPUT_FILE_APPEND] = "append",
5125 };
5126
5127 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5128
5129 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5130 [EXEC_UTMP_INIT] = "init",
5131 [EXEC_UTMP_LOGIN] = "login",
5132 [EXEC_UTMP_USER] = "user",
5133 };
5134
5135 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5136
5137 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5138 [EXEC_PRESERVE_NO] = "no",
5139 [EXEC_PRESERVE_YES] = "yes",
5140 [EXEC_PRESERVE_RESTART] = "restart",
5141 };
5142
5143 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5144
5145 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5146 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5147 [EXEC_DIRECTORY_STATE] = "StateDirectory",
5148 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5149 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5150 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5151 };
5152
5153 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5154
5155 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5156 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
5157 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
5158 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
5159 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
5160 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
5161 };
5162
5163 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5164
5165 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5166 [EXEC_KEYRING_INHERIT] = "inherit",
5167 [EXEC_KEYRING_PRIVATE] = "private",
5168 [EXEC_KEYRING_SHARED] = "shared",
5169 };
5170
5171 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);