]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
Merge pull request #9624 from poettering/service-state-flush
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <glob.h>
6 #include <grp.h>
7 #include <poll.h>
8 #include <signal.h>
9 #include <string.h>
10 #include <sys/capability.h>
11 #include <sys/eventfd.h>
12 #include <sys/mman.h>
13 #include <sys/personality.h>
14 #include <sys/prctl.h>
15 #include <sys/shm.h>
16 #include <sys/socket.h>
17 #include <sys/stat.h>
18 #include <sys/types.h>
19 #include <sys/un.h>
20 #include <unistd.h>
21 #include <utmpx.h>
22
23 #if HAVE_PAM
24 #include <security/pam_appl.h>
25 #endif
26
27 #if HAVE_SELINUX
28 #include <selinux/selinux.h>
29 #endif
30
31 #if HAVE_SECCOMP
32 #include <seccomp.h>
33 #endif
34
35 #if HAVE_APPARMOR
36 #include <sys/apparmor.h>
37 #endif
38
39 #include "sd-messages.h"
40
41 #include "af-list.h"
42 #include "alloc-util.h"
43 #if HAVE_APPARMOR
44 #include "apparmor-util.h"
45 #endif
46 #include "async.h"
47 #include "barrier.h"
48 #include "cap-list.h"
49 #include "capability-util.h"
50 #include "chown-recursive.h"
51 #include "cpu-set-util.h"
52 #include "def.h"
53 #include "env-util.h"
54 #include "errno-list.h"
55 #include "execute.h"
56 #include "exit-status.h"
57 #include "fd-util.h"
58 #include "fileio.h"
59 #include "format-util.h"
60 #include "fs-util.h"
61 #include "glob-util.h"
62 #include "io-util.h"
63 #include "ioprio.h"
64 #include "label.h"
65 #include "log.h"
66 #include "macro.h"
67 #include "manager.h"
68 #include "missing.h"
69 #include "mkdir.h"
70 #include "namespace.h"
71 #include "parse-util.h"
72 #include "path-util.h"
73 #include "process-util.h"
74 #include "rlimit-util.h"
75 #include "rm-rf.h"
76 #if HAVE_SECCOMP
77 #include "seccomp-util.h"
78 #endif
79 #include "securebits.h"
80 #include "securebits-util.h"
81 #include "selinux-util.h"
82 #include "signal-util.h"
83 #include "smack-util.h"
84 #include "socket-util.h"
85 #include "special.h"
86 #include "stat-util.h"
87 #include "string-table.h"
88 #include "string-util.h"
89 #include "strv.h"
90 #include "syslog-util.h"
91 #include "terminal-util.h"
92 #include "umask-util.h"
93 #include "unit.h"
94 #include "user-util.h"
95 #include "util.h"
96 #include "utmp-wtmp.h"
97
98 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
99 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
100
101 /* This assumes there is a 'tty' group */
102 #define TTY_MODE 0620
103
104 #define SNDBUF_SIZE (8*1024*1024)
105
/* Moves the descriptors in fds[] so that entry i ends up being fd number i+3. The array is rewritten in
 * place: every descriptor that had to be moved is closed and its slot replaced by the duplicate.
 * Returns 0 on success, or a negative errno if fcntl(F_DUPFD) fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0, retry_at;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int idx;

                retry_at = -1;

                for (idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, dup_fd;

                        /* Nothing to do if the descriptor already sits where we want it */
                        if (fds[idx] == wanted)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* F_DUPFD hands out the lowest free number >= wanted. If we got something
                         * else the slot was still taken, so note the first such index and make
                         * another pass starting there. */
                        if (dup_fd != wanted && retry_at < 0)
                                retry_at = idx;
                }

                first = retry_at;
        } while (retry_at >= 0);

        return 0;
}
150
/* Adjusts the file flags of the fds we are about to pass on. The first n_socket_fds entries get
 * O_NONBLOCK set or cleared according to 'nonblock'; the n_storage_fds entries that follow are left
 * alone in that respect. FD_CLOEXEC is cleared on all of them. Returns 0 or the first negative error
 * reported by fd_nonblock()/fd_cloexec(). */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total, idx;

        total = n_socket_fds + n_storage_fds;
        if (total == 0)
                return 0;

        assert(fds);

        for (idx = 0; idx < total; idx++) {
                int r;

                /* O_NONBLOCK only matters for the socket activation fds at the front of the array */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* These fds are meant for our children, hence FD_CLOEXEC always has to go */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
183
184 static const char *exec_context_tty_path(const ExecContext *context) {
185 assert(context);
186
187 if (context->stdio_as_fds)
188 return NULL;
189
190 if (context->tty_path)
191 return context->tty_path;
192
193 return "/dev/console";
194 }
195
196 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
197 const char *path;
198
199 assert(context);
200
201 path = exec_context_tty_path(context);
202
203 if (context->tty_vhangup) {
204 if (p && p->stdin_fd >= 0)
205 (void) terminal_vhangup_fd(p->stdin_fd);
206 else if (path)
207 (void) terminal_vhangup(path);
208 }
209
210 if (context->tty_reset) {
211 if (p && p->stdin_fd >= 0)
212 (void) reset_terminal_fd(p->stdin_fd, true);
213 else if (path)
214 (void) reset_terminal(path);
215 }
216
217 if (context->tty_vt_disallocate && path)
218 (void) vt_disallocate(path);
219 }
220
221 static bool is_terminal_input(ExecInput i) {
222 return IN_SET(i,
223 EXEC_INPUT_TTY,
224 EXEC_INPUT_TTY_FORCE,
225 EXEC_INPUT_TTY_FAIL);
226 }
227
228 static bool is_terminal_output(ExecOutput o) {
229 return IN_SET(o,
230 EXEC_OUTPUT_TTY,
231 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
232 EXEC_OUTPUT_KMSG_AND_CONSOLE,
233 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
234 }
235
236 static bool is_syslog_output(ExecOutput o) {
237 return IN_SET(o,
238 EXEC_OUTPUT_SYSLOG,
239 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
240 }
241
242 static bool is_kmsg_output(ExecOutput o) {
243 return IN_SET(o,
244 EXEC_OUTPUT_KMSG,
245 EXEC_OUTPUT_KMSG_AND_CONSOLE);
246 }
247
248 static bool exec_context_needs_term(const ExecContext *c) {
249 assert(c);
250
251 /* Return true if the execution context suggests we should set $TERM to something useful. */
252
253 if (is_terminal_input(c->std_input))
254 return true;
255
256 if (is_terminal_output(c->std_output))
257 return true;
258
259 if (is_terminal_output(c->std_error))
260 return true;
261
262 return !!c->tty_path;
263 }
264
/* Opens /dev/null with the given open() flags (plus O_NOCTTY) and hands the resulting descriptor to
 * move_fd() with nfd as the target. Returns -errno if the open() fails, otherwise whatever move_fd()
 * returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
276
277 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
278 static const union sockaddr_union sa = {
279 .un.sun_family = AF_UNIX,
280 .un.sun_path = "/run/systemd/journal/stdout",
281 };
282 uid_t olduid = UID_INVALID;
283 gid_t oldgid = GID_INVALID;
284 int r;
285
286 if (gid_is_valid(gid)) {
287 oldgid = getgid();
288
289 if (setegid(gid) < 0)
290 return -errno;
291 }
292
293 if (uid_is_valid(uid)) {
294 olduid = getuid();
295
296 if (seteuid(uid) < 0) {
297 r = -errno;
298 goto restore_gid;
299 }
300 }
301
302 r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
303
304 /* If we fail to restore the uid or gid, things will likely
305 fail later on. This should only happen if an LSM interferes. */
306
307 if (uid_is_valid(uid))
308 (void) seteuid(olduid);
309
310 restore_gid:
311 if (gid_is_valid(gid))
312 (void) setegid(oldgid);
313
314 return r;
315 }
316
317 static int connect_logger_as(
318 const Unit *unit,
319 const ExecContext *context,
320 const ExecParameters *params,
321 ExecOutput output,
322 const char *ident,
323 int nfd,
324 uid_t uid,
325 gid_t gid) {
326
327 int fd, r;
328
329 assert(context);
330 assert(params);
331 assert(output < _EXEC_OUTPUT_MAX);
332 assert(ident);
333 assert(nfd >= 0);
334
335 fd = socket(AF_UNIX, SOCK_STREAM, 0);
336 if (fd < 0)
337 return -errno;
338
339 r = connect_journal_socket(fd, uid, gid);
340 if (r < 0)
341 return r;
342
343 if (shutdown(fd, SHUT_RD) < 0) {
344 safe_close(fd);
345 return -errno;
346 }
347
348 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
349
350 dprintf(fd,
351 "%s\n"
352 "%s\n"
353 "%i\n"
354 "%i\n"
355 "%i\n"
356 "%i\n"
357 "%i\n",
358 context->syslog_identifier ?: ident,
359 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
360 context->syslog_priority,
361 !!context->syslog_level_prefix,
362 is_syslog_output(output),
363 is_kmsg_output(output),
364 is_terminal_output(output));
365
366 return move_fd(fd, nfd, false);
367 }
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and hands the descriptor to
 * move_fd() with nfd as the target. A negative result of open_terminal() is passed through as is. */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int term_fd;

        assert(path);
        assert(nfd >= 0);

        term_fd = open_terminal(path, flags | O_NOCTTY);

        return term_fd < 0 ? term_fd : move_fd(term_fd, nfd, false);
}
380
381 static int acquire_path(const char *path, int flags, mode_t mode) {
382 union sockaddr_union sa = {
383 .sa.sa_family = AF_UNIX,
384 };
385 int fd, r;
386
387 assert(path);
388
389 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
390 flags |= O_CREAT;
391
392 fd = open(path, flags|O_NOCTTY, mode);
393 if (fd >= 0)
394 return fd;
395
396 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
397 return -errno;
398 if (strlen(path) > sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
399 return -ENXIO;
400
401 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
402
403 fd = socket(AF_UNIX, SOCK_STREAM, 0);
404 if (fd < 0)
405 return -errno;
406
407 strncpy(sa.un.sun_path, path, sizeof(sa.un.sun_path));
408 if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
409 safe_close(fd);
410 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
411 * indication that his wasn't an AF_UNIX socket after all */
412 }
413
414 if ((flags & O_ACCMODE) == O_RDONLY)
415 r = shutdown(fd, SHUT_WR);
416 else if ((flags & O_ACCMODE) == O_WRONLY)
417 r = shutdown(fd, SHUT_RD);
418 else
419 return fd;
420 if (r < 0) {
421 safe_close(fd);
422 return -errno;
423 }
424
425 return fd;
426 }
427
428 static int fixup_input(
429 const ExecContext *context,
430 int socket_fd,
431 bool apply_tty_stdin) {
432
433 ExecInput std_input;
434
435 assert(context);
436
437 std_input = context->std_input;
438
439 if (is_terminal_input(std_input) && !apply_tty_stdin)
440 return EXEC_INPUT_NULL;
441
442 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
443 return EXEC_INPUT_NULL;
444
445 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
446 return EXEC_INPUT_NULL;
447
448 return std_input;
449 }
450
451 static int fixup_output(ExecOutput std_output, int socket_fd) {
452
453 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
454 return EXEC_OUTPUT_INHERIT;
455
456 return std_output;
457 }
458
459 static int setup_input(
460 const ExecContext *context,
461 const ExecParameters *params,
462 int socket_fd,
463 int named_iofds[3]) {
464
465 ExecInput i;
466
467 assert(context);
468 assert(params);
469
470 if (params->stdin_fd >= 0) {
471 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
472 return -errno;
473
474 /* Try to make this the controlling tty, if it is a tty, and reset it */
475 if (isatty(STDIN_FILENO)) {
476 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
477 (void) reset_terminal_fd(STDIN_FILENO, true);
478 }
479
480 return STDIN_FILENO;
481 }
482
483 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
484
485 switch (i) {
486
487 case EXEC_INPUT_NULL:
488 return open_null_as(O_RDONLY, STDIN_FILENO);
489
490 case EXEC_INPUT_TTY:
491 case EXEC_INPUT_TTY_FORCE:
492 case EXEC_INPUT_TTY_FAIL: {
493 int fd;
494
495 fd = acquire_terminal(exec_context_tty_path(context),
496 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
497 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
498 ACQUIRE_TERMINAL_WAIT,
499 USEC_INFINITY);
500 if (fd < 0)
501 return fd;
502
503 return move_fd(fd, STDIN_FILENO, false);
504 }
505
506 case EXEC_INPUT_SOCKET:
507 assert(socket_fd >= 0);
508
509 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
510
511 case EXEC_INPUT_NAMED_FD:
512 assert(named_iofds[STDIN_FILENO] >= 0);
513
514 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
515 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
516
517 case EXEC_INPUT_DATA: {
518 int fd;
519
520 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
521 if (fd < 0)
522 return fd;
523
524 return move_fd(fd, STDIN_FILENO, false);
525 }
526
527 case EXEC_INPUT_FILE: {
528 bool rw;
529 int fd;
530
531 assert(context->stdio_file[STDIN_FILENO]);
532
533 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
534 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
535
536 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
537 if (fd < 0)
538 return fd;
539
540 return move_fd(fd, STDIN_FILENO, false);
541 }
542
543 default:
544 assert_not_reached("Unknown input type");
545 }
546 }
547
548 static int setup_output(
549 const Unit *unit,
550 const ExecContext *context,
551 const ExecParameters *params,
552 int fileno,
553 int socket_fd,
554 int named_iofds[3],
555 const char *ident,
556 uid_t uid,
557 gid_t gid,
558 dev_t *journal_stream_dev,
559 ino_t *journal_stream_ino) {
560
561 ExecOutput o;
562 ExecInput i;
563 int r;
564
565 assert(unit);
566 assert(context);
567 assert(params);
568 assert(ident);
569 assert(journal_stream_dev);
570 assert(journal_stream_ino);
571
572 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
573
574 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
575 return -errno;
576
577 return STDOUT_FILENO;
578 }
579
580 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
581 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
582 return -errno;
583
584 return STDERR_FILENO;
585 }
586
587 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
588 o = fixup_output(context->std_output, socket_fd);
589
590 if (fileno == STDERR_FILENO) {
591 ExecOutput e;
592 e = fixup_output(context->std_error, socket_fd);
593
594 /* This expects the input and output are already set up */
595
596 /* Don't change the stderr file descriptor if we inherit all
597 * the way and are not on a tty */
598 if (e == EXEC_OUTPUT_INHERIT &&
599 o == EXEC_OUTPUT_INHERIT &&
600 i == EXEC_INPUT_NULL &&
601 !is_terminal_input(context->std_input) &&
602 getppid () != 1)
603 return fileno;
604
605 /* Duplicate from stdout if possible */
606 if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
607 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
608
609 o = e;
610
611 } else if (o == EXEC_OUTPUT_INHERIT) {
612 /* If input got downgraded, inherit the original value */
613 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
614 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
615
616 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
617 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
618 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
619
620 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
621 if (getppid() != 1)
622 return fileno;
623
624 /* We need to open /dev/null here anew, to get the right access mode. */
625 return open_null_as(O_WRONLY, fileno);
626 }
627
628 switch (o) {
629
630 case EXEC_OUTPUT_NULL:
631 return open_null_as(O_WRONLY, fileno);
632
633 case EXEC_OUTPUT_TTY:
634 if (is_terminal_input(i))
635 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
636
637 /* We don't reset the terminal if this is just about output */
638 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
639
640 case EXEC_OUTPUT_SYSLOG:
641 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
642 case EXEC_OUTPUT_KMSG:
643 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
644 case EXEC_OUTPUT_JOURNAL:
645 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
646 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
647 if (r < 0) {
648 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
649 r = open_null_as(O_WRONLY, fileno);
650 } else {
651 struct stat st;
652
653 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
654 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
655 * services to detect whether they are connected to the journal or not.
656 *
657 * If both stdout and stderr are connected to a stream then let's make sure to store the data
658 * about STDERR as that's usually the best way to do logging. */
659
660 if (fstat(fileno, &st) >= 0 &&
661 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
662 *journal_stream_dev = st.st_dev;
663 *journal_stream_ino = st.st_ino;
664 }
665 }
666 return r;
667
668 case EXEC_OUTPUT_SOCKET:
669 assert(socket_fd >= 0);
670
671 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
672
673 case EXEC_OUTPUT_NAMED_FD:
674 assert(named_iofds[fileno] >= 0);
675
676 (void) fd_nonblock(named_iofds[fileno], false);
677 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
678
679 case EXEC_OUTPUT_FILE:
680 case EXEC_OUTPUT_FILE_APPEND: {
681 bool rw;
682 int fd, flags;
683
684 assert(context->stdio_file[fileno]);
685
686 rw = context->std_input == EXEC_INPUT_FILE &&
687 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
688
689 if (rw)
690 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
691
692 flags = O_WRONLY;
693 if (o == EXEC_OUTPUT_FILE_APPEND)
694 flags |= O_APPEND;
695
696 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
697
698 if (fd < 0)
699 return fd;
700
701 return move_fd(fd, fileno, 0);
702 }
703
704 default:
705 assert_not_reached("Unknown error type");
706 }
707 }
708
709 static int chown_terminal(int fd, uid_t uid) {
710 struct stat st;
711
712 assert(fd >= 0);
713
714 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
715 if (isatty(fd) < 1)
716 return 0;
717
718 /* This might fail. What matters are the results. */
719 (void) fchown(fd, uid, -1);
720 (void) fchmod(fd, TTY_MODE);
721
722 if (fstat(fd, &st) < 0)
723 return -errno;
724
725 if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
726 return -EPERM;
727
728 return 0;
729 }
730
731 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
732 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
733 int r;
734
735 assert(_saved_stdin);
736 assert(_saved_stdout);
737
738 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
739 if (saved_stdin < 0)
740 return -errno;
741
742 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
743 if (saved_stdout < 0)
744 return -errno;
745
746 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
747 if (fd < 0)
748 return fd;
749
750 r = chown_terminal(fd, getuid());
751 if (r < 0)
752 return r;
753
754 r = reset_terminal_fd(fd, true);
755 if (r < 0)
756 return r;
757
758 r = rearrange_stdio(fd, fd, STDERR_FILENO);
759 fd = -1;
760 if (r < 0)
761 return r;
762
763 *_saved_stdin = saved_stdin;
764 *_saved_stdout = saved_stdout;
765
766 saved_stdin = saved_stdout = -1;
767
768 return 0;
769 }
770
771 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
772 assert(err < 0);
773
774 if (err == -ETIMEDOUT)
775 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
776 else {
777 errno = -err;
778 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
779 }
780 }
781
782 static void write_confirm_error(int err, const char *vc, const Unit *u) {
783 _cleanup_close_ int fd = -1;
784
785 assert(vc);
786
787 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
788 if (fd < 0)
789 return;
790
791 write_confirm_error_fd(err, fd, u);
792 }
793
/* Counterpart of setup_confirm_stdio(): releases the terminal, puts the saved descriptors back in
 * place as stdin/stdout and closes them. Returns 0, or the negative errno of the last dup2() that
 * failed. The saved fds are closed in either case. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int r = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                r = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                r = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return r;
}
815
/* Possible outcomes of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1,   /* 'f': don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0,   /* 's': don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         =  1,   /* 'y': execute the command */
};
821
822 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
823 int saved_stdout = -1, saved_stdin = -1, r;
824 _cleanup_free_ char *e = NULL;
825 char c;
826
827 /* For any internal errors, assume a positive response. */
828 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
829 if (r < 0) {
830 write_confirm_error(r, vc, u);
831 return CONFIRM_EXECUTE;
832 }
833
834 /* confirm_spawn might have been disabled while we were sleeping. */
835 if (manager_is_confirm_spawn_disabled(u->manager)) {
836 r = 1;
837 goto restore_stdio;
838 }
839
840 e = ellipsize(cmdline, 60, 100);
841 if (!e) {
842 log_oom();
843 r = CONFIRM_EXECUTE;
844 goto restore_stdio;
845 }
846
847 for (;;) {
848 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
849 if (r < 0) {
850 write_confirm_error_fd(r, STDOUT_FILENO, u);
851 r = CONFIRM_EXECUTE;
852 goto restore_stdio;
853 }
854
855 switch (c) {
856 case 'c':
857 printf("Resuming normal execution.\n");
858 manager_disable_confirm_spawn();
859 r = 1;
860 break;
861 case 'D':
862 unit_dump(u, stdout, " ");
863 continue; /* ask again */
864 case 'f':
865 printf("Failing execution.\n");
866 r = CONFIRM_PRETEND_FAILURE;
867 break;
868 case 'h':
869 printf(" c - continue, proceed without asking anymore\n"
870 " D - dump, show the state of the unit\n"
871 " f - fail, don't execute the command and pretend it failed\n"
872 " h - help\n"
873 " i - info, show a short summary of the unit\n"
874 " j - jobs, show jobs that are in progress\n"
875 " s - skip, don't execute the command and pretend it succeeded\n"
876 " y - yes, execute the command\n");
877 continue; /* ask again */
878 case 'i':
879 printf(" Description: %s\n"
880 " Unit: %s\n"
881 " Command: %s\n",
882 u->id, u->description, cmdline);
883 continue; /* ask again */
884 case 'j':
885 manager_dump_jobs(u->manager, stdout, " ");
886 continue; /* ask again */
887 case 'n':
888 /* 'n' was removed in favor of 'f'. */
889 printf("Didn't understand 'n', did you mean 'f'?\n");
890 continue; /* ask again */
891 case 's':
892 printf("Skipping execution.\n");
893 r = CONFIRM_PRETEND_SUCCESS;
894 break;
895 case 'y':
896 r = CONFIRM_EXECUTE;
897 break;
898 default:
899 assert_not_reached("Unhandled choice");
900 }
901 break;
902 }
903
904 restore_stdio:
905 restore_confirm_stdio(&saved_stdin, &saved_stdout);
906 return r;
907 }
908
909 static int get_fixed_user(const ExecContext *c, const char **user,
910 uid_t *uid, gid_t *gid,
911 const char **home, const char **shell) {
912 int r;
913 const char *name;
914
915 assert(c);
916
917 if (!c->user)
918 return 0;
919
920 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
921 * (i.e. are "/" or "/bin/nologin"). */
922
923 name = c->user;
924 r = get_user_creds_clean(&name, uid, gid, home, shell);
925 if (r < 0)
926 return r;
927
928 *user = name;
929 return 0;
930 }
931
932 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
933 int r;
934 const char *name;
935
936 assert(c);
937
938 if (!c->group)
939 return 0;
940
941 name = c->group;
942 r = get_group_creds(&name, gid);
943 if (r < 0)
944 return r;
945
946 *group = name;
947 return 0;
948 }
949
950 static int get_supplementary_groups(const ExecContext *c, const char *user,
951 const char *group, gid_t gid,
952 gid_t **supplementary_gids, int *ngids) {
953 char **i;
954 int r, k = 0;
955 int ngroups_max;
956 bool keep_groups = false;
957 gid_t *groups = NULL;
958 _cleanup_free_ gid_t *l_gids = NULL;
959
960 assert(c);
961
962 /*
963 * If user is given, then lookup GID and supplementary groups list.
964 * We avoid NSS lookups for gid=0. Also we have to initialize groups
965 * here and as early as possible so we keep the list of supplementary
966 * groups of the caller.
967 */
968 if (user && gid_is_valid(gid) && gid != 0) {
969 /* First step, initialize groups from /etc/groups */
970 if (initgroups(user, gid) < 0)
971 return -errno;
972
973 keep_groups = true;
974 }
975
976 if (strv_isempty(c->supplementary_groups))
977 return 0;
978
979 /*
980 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
981 * be positive, otherwise fail.
982 */
983 errno = 0;
984 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
985 if (ngroups_max <= 0) {
986 if (errno > 0)
987 return -errno;
988 else
989 return -EOPNOTSUPP; /* For all other values */
990 }
991
992 l_gids = new(gid_t, ngroups_max);
993 if (!l_gids)
994 return -ENOMEM;
995
996 if (keep_groups) {
997 /*
998 * Lookup the list of groups that the user belongs to, we
999 * avoid NSS lookups here too for gid=0.
1000 */
1001 k = ngroups_max;
1002 if (getgrouplist(user, gid, l_gids, &k) < 0)
1003 return -EINVAL;
1004 } else
1005 k = 0;
1006
1007 STRV_FOREACH(i, c->supplementary_groups) {
1008 const char *g;
1009
1010 if (k >= ngroups_max)
1011 return -E2BIG;
1012
1013 g = *i;
1014 r = get_group_creds(&g, l_gids+k);
1015 if (r < 0)
1016 return r;
1017
1018 k++;
1019 }
1020
1021 /*
1022 * Sets ngids to zero to drop all supplementary groups, happens
1023 * when we are under root and SupplementaryGroups= is empty.
1024 */
1025 if (k == 0) {
1026 *ngids = 0;
1027 return 0;
1028 }
1029
1030 /* Otherwise get the final list of supplementary groups */
1031 groups = memdup(l_gids, sizeof(gid_t) * k);
1032 if (!groups)
1033 return -ENOMEM;
1034
1035 *supplementary_gids = groups;
1036 *ngids = k;
1037
1038 groups = NULL;
1039
1040 return 0;
1041 }
1042
/* Applies the group credentials: installs the supplementary groups if there are any (ngids > 0), and
 * sets real, effective and saved gid to 'gid' if it is valid. Returns 0 on success, a negative
 * errno-style value on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        int r;

        /* Only touch the supplementary groups if we actually have a list */
        if (ngids > 0) {
                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then switch our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1061
1062 static int enforce_user(const ExecContext *context, uid_t uid) {
1063 assert(context);
1064
1065 if (!uid_is_valid(uid))
1066 return 0;
1067
1068 /* Sets (but doesn't look up) the uid and make sure we keep the
1069 * capabilities while doing so. */
1070
1071 if (context->capability_ambient_set != 0) {
1072
1073 /* First step: If we need to keep capabilities but
1074 * drop privileges we need to make sure we keep our
1075 * caps, while we drop privileges. */
1076 if (uid != 0) {
1077 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1078
1079 if (prctl(PR_GET_SECUREBITS) != sb)
1080 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1081 return -errno;
1082 }
1083 }
1084
1085 /* Second step: actually set the uids */
1086 if (setresuid(uid, uid, uid) < 0)
1087 return -errno;
1088
1089 /* At this point we should have all necessary capabilities but
1090 are otherwise a normal user. However, the caps might got
1091 corrupted due to the setresuid() so we need clean them up
1092 later. This is done outside of this call. */
1093
1094 return 0;
1095 }
1096
#if HAVE_PAM

/* PAM conversation callback used by setup_pam(). Conversations are not supported, hence every
 * request is answered with PAM_CONV_ERR and none of the arguments is looked at. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        return PAM_CONV_ERR;
}

#endif
1111
/* Opens a PAM session for 'user' with the PAM service 'name' and forks off a helper process
 * ("(sd-pam)") that closes the session again once the calling process has gone away. On success the
 * environment block in *env is replaced by the one PAM provides. 'fds' are descriptors the helper
 * must not keep open. Returns a negative errno-style value on failure (-EPERM for PAM errors).
 * Without PAM support compiled in this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *pamh = NULL;
        sigset_t saved_mask;
        int pam_code = PAM_SUCCESS, pam_flags = 0, r;
        char **var, **pam_env = NULL;
        bool session_open = false;
        pid_t pam_pid = 0, parent_pid;

        assert(name);
        assert(user);
        assert(env);

        /* PAM is set up in the parent process, which then forks. The child stays around until it is
         * killed via PR_GET_PDEATHSIG or by systemd via the cgroup logic, and then removes the PAM
         * session again. The parent process goes on to exec() the actual daemon. Things are done this
         * way to ensure that the main PID of the daemon is the one that was initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                pam_flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &pamh);
        if (pam_code != PAM_SUCCESS) {
                pamh = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(pamh, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Make our environment known to the PAM modules */
        STRV_FOREACH(var, *env) {
                pam_code = pam_putenv(pamh, *var);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(pamh, pam_flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(pamh, pam_flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        session_open = true;

        pam_env = pam_getenvlist(pamh);
        if (!pam_env) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */
        assert_se(sigprocmask_many(SIG_BLOCK, &saved_mask, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, exit_code = EXIT_PAM;

                /* We are the child, whose job it is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Don't keep the passed fds open in this child. The assumption is that otherwise
                 * only those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - none are needed for pam_close_session(), and this makes
                 * PR_SET_PDEATHSIG work in most cases. Failures are only warned about - but then
                 * sd-pam threads have to be expected to fail to exit normally. */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Ask to be signalled when our parent dies. This only works if the setresuid()
                 * above succeeded, as the kernel otherwise does not allow unprivileged parents to
                 * kill their privileged children this way. The control group kill logic is relied
                 * on to do the rest. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This matters especially with regards to
                 * dropping privileges, as unit setup might otherwise race against our setresuid(2)
                 * call.
                 *
                 * If the parent aborted this is detected below, hence a failure is ignored here. */
                (void) barrier_place(&barrier);

                /* Only wait if our parent process is still around */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* Retry on EINTR, give up on any other failure */
                        while (sigwait(&ss, &sig) < 0)
                                if (errno != EINTR)
                                        goto child_finish;

                        assert(sig == SIGTERM);
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(pamh, pam_flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                exit_code = 0;

        child_finish:
                pam_end(pamh, pam_code | pam_flags);
                _exit(exit_code);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* The child was forked off successfully and will do all the cleanups, hence forget about
         * the handle here. */
        pamh = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &saved_mask, NULL) >= 0);

        /* The PAM modules might have opened the log, but we don't want this fd around, hence close
         * it explicitly. */
        closelog();

        /* Synchronously wait for the child to initialize. Errors cannot be recovered from, hence
         * they are not acted upon. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, pam_env);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(pamh, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (pamh) {
                if (session_open)
                        pam_code = pam_close_session(pamh, pam_flags);

                pam_end(pamh, pam_code | pam_flags);
        }

        strv_free(pam_env);
        closelog();

        return r;
#else
        return 0;
#endif
}
1317
/* Derives a short process name from the last component of 'path' and installs it via rename_process().
 * The name is wrapped in parentheses and must fit in 10 characters (i.e. the length of "/sbin/init") to
 * look pretty in /bin/ps, hence at most 8 characters of the component are used. If the component is
 * empty, "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char buf[11], *w = buf;
        const char *name;
        size_t len;

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        /* Keep the tail rather than the head: the end of the name is usually the more interesting part,
         * since the beginning might just be "systemd-". */
        len = strlen(name);
        if (len > 8) {
                name += len - 8;
                len = 8;
        }

        *w++ = '(';
        memcpy(w, name, len);
        w += len;
        *w++ = ')';
        *w = 0;

        rename_process(buf);
}
1348
1349 static bool context_has_address_families(const ExecContext *c) {
1350 assert(c);
1351
1352 return c->address_families_whitelist ||
1353 !set_isempty(c->address_families);
1354 }
1355
1356 static bool context_has_syscall_filters(const ExecContext *c) {
1357 assert(c);
1358
1359 return c->syscall_whitelist ||
1360 !hashmap_isempty(c->syscall_filter);
1361 }
1362
/* Determines whether "no new privileges" (NNP) is needed for this context: either because it was
 * requested explicitly, or because we lack CAP_SYS_ADMIN and any of the seccomp-backed settings is
 * enabled. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        /* Explicitly requested */
        if (c->no_new_privileges)
                return true;

        /* If we are privileged, we don't need NNP */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* We need NNP if we have any form of seccomp and are unprivileged */
        if (context_has_address_families(c))
                return true;
        if (c->memory_deny_write_execute || c->restrict_realtime)
                return true;
        if (exec_context_restrict_namespaces_set(c))
                return true;
        if (c->protect_kernel_tunables || c->protect_kernel_modules || c->private_devices)
                return true;
        if (context_has_syscall_filters(c))
                return true;
        if (!set_isempty(c->syscall_archs))
                return true;

        return c->lock_personality;
}
1384
1385 #if HAVE_SECCOMP
1386
1387 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1388
1389 if (is_seccomp_available())
1390 return false;
1391
1392 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1393 return true;
1394 }
1395
1396 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1397 uint32_t negative_action, default_action, action;
1398 int r;
1399
1400 assert(u);
1401 assert(c);
1402
1403 if (!context_has_syscall_filters(c))
1404 return 0;
1405
1406 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1407 return 0;
1408
1409 negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1410
1411 if (c->syscall_whitelist) {
1412 default_action = negative_action;
1413 action = SCMP_ACT_ALLOW;
1414 } else {
1415 default_action = SCMP_ACT_ALLOW;
1416 action = negative_action;
1417 }
1418
1419 if (needs_ambient_hack) {
1420 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1421 if (r < 0)
1422 return r;
1423 }
1424
1425 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1426 }
1427
1428 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1429 assert(u);
1430 assert(c);
1431
1432 if (set_isempty(c->syscall_archs))
1433 return 0;
1434
1435 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1436 return 0;
1437
1438 return seccomp_restrict_archs(c->syscall_archs);
1439 }
1440
1441 static int apply_address_families(const Unit* u, const ExecContext *c) {
1442 assert(u);
1443 assert(c);
1444
1445 if (!context_has_address_families(c))
1446 return 0;
1447
1448 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1449 return 0;
1450
1451 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1452 }
1453
1454 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1455 assert(u);
1456 assert(c);
1457
1458 if (!c->memory_deny_write_execute)
1459 return 0;
1460
1461 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1462 return 0;
1463
1464 return seccomp_memory_deny_write_execute();
1465 }
1466
1467 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1468 assert(u);
1469 assert(c);
1470
1471 if (!c->restrict_realtime)
1472 return 0;
1473
1474 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1475 return 0;
1476
1477 return seccomp_restrict_realtime();
1478 }
1479
1480 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1481 assert(u);
1482 assert(c);
1483
1484 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1485 * let's protect even those systems where this is left on in the kernel. */
1486
1487 if (!c->protect_kernel_tunables)
1488 return 0;
1489
1490 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1491 return 0;
1492
1493 return seccomp_protect_sysctl();
1494 }
1495
1496 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1497 assert(u);
1498 assert(c);
1499
1500 /* Turn off module syscalls on ProtectKernelModules=yes */
1501
1502 if (!c->protect_kernel_modules)
1503 return 0;
1504
1505 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1506 return 0;
1507
1508 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1509 }
1510
1511 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1512 assert(u);
1513 assert(c);
1514
1515 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1516
1517 if (!c->private_devices)
1518 return 0;
1519
1520 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1521 return 0;
1522
1523 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1524 }
1525
1526 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1527 assert(u);
1528 assert(c);
1529
1530 if (!exec_context_restrict_namespaces_set(c))
1531 return 0;
1532
1533 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1534 return 0;
1535
1536 return seccomp_restrict_namespaces(c->restrict_namespaces);
1537 }
1538
1539 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1540 unsigned long personality;
1541 int r;
1542
1543 assert(u);
1544 assert(c);
1545
1546 if (!c->lock_personality)
1547 return 0;
1548
1549 if (skip_seccomp_unavailable(u, "LockPersonality="))
1550 return 0;
1551
1552 personality = c->personality;
1553
1554 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1555 if (personality == PERSONALITY_INVALID) {
1556
1557 r = opinionated_personality(&personality);
1558 if (r < 0)
1559 return r;
1560 }
1561
1562 return seccomp_lock_personality(personality);
1563 }
1564
1565 #endif
1566
1567 static void do_idle_pipe_dance(int idle_pipe[4]) {
1568 assert(idle_pipe);
1569
1570 idle_pipe[1] = safe_close(idle_pipe[1]);
1571 idle_pipe[2] = safe_close(idle_pipe[2]);
1572
1573 if (idle_pipe[0] >= 0) {
1574 int r;
1575
1576 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1577
1578 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1579 ssize_t n;
1580
1581 /* Signal systemd that we are bored and want to continue. */
1582 n = write(idle_pipe[3], "x", 1);
1583 if (n > 0)
1584 /* Wait for systemd to react to the signal above. */
1585 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1586 }
1587
1588 idle_pipe[0] = safe_close(idle_pipe[0]);
1589
1590 }
1591
1592 idle_pipe[3] = safe_close(idle_pipe[3]);
1593 }
1594
1595 static int build_environment(
1596 const Unit *u,
1597 const ExecContext *c,
1598 const ExecParameters *p,
1599 size_t n_fds,
1600 const char *home,
1601 const char *username,
1602 const char *shell,
1603 dev_t journal_stream_dev,
1604 ino_t journal_stream_ino,
1605 char ***ret) {
1606
1607 _cleanup_strv_free_ char **our_env = NULL;
1608 size_t n_env = 0;
1609 char *x;
1610
1611 assert(u);
1612 assert(c);
1613 assert(ret);
1614
1615 our_env = new0(char*, 14);
1616 if (!our_env)
1617 return -ENOMEM;
1618
1619 if (n_fds > 0) {
1620 _cleanup_free_ char *joined = NULL;
1621
1622 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1623 return -ENOMEM;
1624 our_env[n_env++] = x;
1625
1626 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1627 return -ENOMEM;
1628 our_env[n_env++] = x;
1629
1630 joined = strv_join(p->fd_names, ":");
1631 if (!joined)
1632 return -ENOMEM;
1633
1634 x = strjoin("LISTEN_FDNAMES=", joined);
1635 if (!x)
1636 return -ENOMEM;
1637 our_env[n_env++] = x;
1638 }
1639
1640 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1641 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1642 return -ENOMEM;
1643 our_env[n_env++] = x;
1644
1645 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1646 return -ENOMEM;
1647 our_env[n_env++] = x;
1648 }
1649
1650 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1651 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1652 * check the database directly. */
1653 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1654 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1655 if (!x)
1656 return -ENOMEM;
1657 our_env[n_env++] = x;
1658 }
1659
1660 if (home) {
1661 x = strappend("HOME=", home);
1662 if (!x)
1663 return -ENOMEM;
1664 our_env[n_env++] = x;
1665 }
1666
1667 if (username) {
1668 x = strappend("LOGNAME=", username);
1669 if (!x)
1670 return -ENOMEM;
1671 our_env[n_env++] = x;
1672
1673 x = strappend("USER=", username);
1674 if (!x)
1675 return -ENOMEM;
1676 our_env[n_env++] = x;
1677 }
1678
1679 if (shell) {
1680 x = strappend("SHELL=", shell);
1681 if (!x)
1682 return -ENOMEM;
1683 our_env[n_env++] = x;
1684 }
1685
1686 if (!sd_id128_is_null(u->invocation_id)) {
1687 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1688 return -ENOMEM;
1689
1690 our_env[n_env++] = x;
1691 }
1692
1693 if (exec_context_needs_term(c)) {
1694 const char *tty_path, *term = NULL;
1695
1696 tty_path = exec_context_tty_path(c);
1697
1698 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1699 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1700 * passes to PID 1 ends up all the way in the console login shown. */
1701
1702 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1703 term = getenv("TERM");
1704 if (!term)
1705 term = default_term_for_tty(tty_path);
1706
1707 x = strappend("TERM=", term);
1708 if (!x)
1709 return -ENOMEM;
1710 our_env[n_env++] = x;
1711 }
1712
1713 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1714 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1715 return -ENOMEM;
1716
1717 our_env[n_env++] = x;
1718 }
1719
1720 our_env[n_env++] = NULL;
1721 assert(n_env <= 12);
1722
1723 *ret = TAKE_PTR(our_env);
1724
1725 return 0;
1726 }
1727
1728 static int build_pass_environment(const ExecContext *c, char ***ret) {
1729 _cleanup_strv_free_ char **pass_env = NULL;
1730 size_t n_env = 0, n_bufsize = 0;
1731 char **i;
1732
1733 STRV_FOREACH(i, c->pass_environment) {
1734 _cleanup_free_ char *x = NULL;
1735 char *v;
1736
1737 v = getenv(*i);
1738 if (!v)
1739 continue;
1740 x = strjoin(*i, "=", v);
1741 if (!x)
1742 return -ENOMEM;
1743
1744 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1745 return -ENOMEM;
1746
1747 pass_env[n_env++] = TAKE_PTR(x);
1748 pass_env[n_env] = NULL;
1749 }
1750
1751 *ret = TAKE_PTR(pass_env);
1752
1753 return 0;
1754 }
1755
1756 static bool exec_needs_mount_namespace(
1757 const ExecContext *context,
1758 const ExecParameters *params,
1759 const ExecRuntime *runtime) {
1760
1761 assert(context);
1762 assert(params);
1763
1764 if (context->root_image)
1765 return true;
1766
1767 if (!strv_isempty(context->read_write_paths) ||
1768 !strv_isempty(context->read_only_paths) ||
1769 !strv_isempty(context->inaccessible_paths))
1770 return true;
1771
1772 if (context->n_bind_mounts > 0)
1773 return true;
1774
1775 if (context->n_temporary_filesystems > 0)
1776 return true;
1777
1778 if (context->mount_flags != 0)
1779 return true;
1780
1781 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1782 return true;
1783
1784 if (context->private_devices ||
1785 context->private_mounts ||
1786 context->protect_system != PROTECT_SYSTEM_NO ||
1787 context->protect_home != PROTECT_HOME_NO ||
1788 context->protect_kernel_tunables ||
1789 context->protect_kernel_modules ||
1790 context->protect_control_groups)
1791 return true;
1792
1793 if (context->root_directory) {
1794 ExecDirectoryType t;
1795
1796 if (context->mount_apivfs)
1797 return true;
1798
1799 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1800 if (!params->prefix[t])
1801 continue;
1802
1803 if (!strv_isempty(context->directories[t].paths))
1804 return true;
1805 }
1806 }
1807
1808 if (context->dynamic_user &&
1809 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1810 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1811 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1812 return true;
1813
1814 return false;
1815 }
1816
1817 static int setup_private_users(uid_t uid, gid_t gid) {
1818 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1819 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1820 _cleanup_close_ int unshare_ready_fd = -1;
1821 _cleanup_(sigkill_waitp) pid_t pid = 0;
1822 uint64_t c = 1;
1823 ssize_t n;
1824 int r;
1825
1826 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1827 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1828 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1829 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1830 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1831 * continues execution normally. */
1832
1833 if (uid != 0 && uid_is_valid(uid)) {
1834 r = asprintf(&uid_map,
1835 "0 0 1\n" /* Map root → root */
1836 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
1837 uid, uid);
1838 if (r < 0)
1839 return -ENOMEM;
1840 } else {
1841 uid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1842 if (!uid_map)
1843 return -ENOMEM;
1844 }
1845
1846 if (gid != 0 && gid_is_valid(gid)) {
1847 r = asprintf(&gid_map,
1848 "0 0 1\n" /* Map root → root */
1849 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
1850 gid, gid);
1851 if (r < 0)
1852 return -ENOMEM;
1853 } else {
1854 gid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1855 if (!gid_map)
1856 return -ENOMEM;
1857 }
1858
1859 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1860 * namespace. */
1861 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1862 if (unshare_ready_fd < 0)
1863 return -errno;
1864
1865 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1866 * failed. */
1867 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1868 return -errno;
1869
1870 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1871 if (r < 0)
1872 return r;
1873 if (r == 0) {
1874 _cleanup_close_ int fd = -1;
1875 const char *a;
1876 pid_t ppid;
1877
1878 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1879 * here, after the parent opened its own user namespace. */
1880
1881 ppid = getppid();
1882 errno_pipe[0] = safe_close(errno_pipe[0]);
1883
1884 /* Wait until the parent unshared the user namespace */
1885 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1886 r = -errno;
1887 goto child_fail;
1888 }
1889
1890 /* Disable the setgroups() system call in the child user namespace, for good. */
1891 a = procfs_file_alloca(ppid, "setgroups");
1892 fd = open(a, O_WRONLY|O_CLOEXEC);
1893 if (fd < 0) {
1894 if (errno != ENOENT) {
1895 r = -errno;
1896 goto child_fail;
1897 }
1898
1899 /* If the file is missing the kernel is too old, let's continue anyway. */
1900 } else {
1901 if (write(fd, "deny\n", 5) < 0) {
1902 r = -errno;
1903 goto child_fail;
1904 }
1905
1906 fd = safe_close(fd);
1907 }
1908
1909 /* First write the GID map */
1910 a = procfs_file_alloca(ppid, "gid_map");
1911 fd = open(a, O_WRONLY|O_CLOEXEC);
1912 if (fd < 0) {
1913 r = -errno;
1914 goto child_fail;
1915 }
1916 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1917 r = -errno;
1918 goto child_fail;
1919 }
1920 fd = safe_close(fd);
1921
1922 /* The write the UID map */
1923 a = procfs_file_alloca(ppid, "uid_map");
1924 fd = open(a, O_WRONLY|O_CLOEXEC);
1925 if (fd < 0) {
1926 r = -errno;
1927 goto child_fail;
1928 }
1929 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1930 r = -errno;
1931 goto child_fail;
1932 }
1933
1934 _exit(EXIT_SUCCESS);
1935
1936 child_fail:
1937 (void) write(errno_pipe[1], &r, sizeof(r));
1938 _exit(EXIT_FAILURE);
1939 }
1940
1941 errno_pipe[1] = safe_close(errno_pipe[1]);
1942
1943 if (unshare(CLONE_NEWUSER) < 0)
1944 return -errno;
1945
1946 /* Let the child know that the namespace is ready now */
1947 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1948 return -errno;
1949
1950 /* Try to read an error code from the child */
1951 n = read(errno_pipe[0], &r, sizeof(r));
1952 if (n < 0)
1953 return -errno;
1954 if (n == sizeof(r)) { /* an error code was sent to us */
1955 if (r < 0)
1956 return r;
1957 return -EIO;
1958 }
1959 if (n != 0) /* on success we should have read 0 bytes */
1960 return -EIO;
1961
1962 r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
1963 pid = 0;
1964 if (r < 0)
1965 return r;
1966 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
1967 return -EIO;
1968
1969 return 0;
1970 }
1971
1972 static int setup_exec_directory(
1973 const ExecContext *context,
1974 const ExecParameters *params,
1975 uid_t uid,
1976 gid_t gid,
1977 ExecDirectoryType type,
1978 int *exit_status) {
1979
1980 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1981 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1982 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1983 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1984 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
1985 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
1986 };
1987 char **rt;
1988 int r;
1989
1990 assert(context);
1991 assert(params);
1992 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
1993 assert(exit_status);
1994
1995 if (!params->prefix[type])
1996 return 0;
1997
1998 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
1999 if (!uid_is_valid(uid))
2000 uid = 0;
2001 if (!gid_is_valid(gid))
2002 gid = 0;
2003 }
2004
2005 STRV_FOREACH(rt, context->directories[type].paths) {
2006 _cleanup_free_ char *p = NULL, *pp = NULL;
2007
2008 p = strjoin(params->prefix[type], "/", *rt);
2009 if (!p) {
2010 r = -ENOMEM;
2011 goto fail;
2012 }
2013
2014 r = mkdir_parents_label(p, 0755);
2015 if (r < 0)
2016 goto fail;
2017
2018 if (context->dynamic_user &&
2019 !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2020 _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
2021
2022 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2023 * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2024 * whose UID is later on reused. To lock this down we use the same trick used by container
2025 * managers to prohibit host users to get access to files of the same UID in containers: we
2026 * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2027 * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2028 * to make this directory permeable for the service itself.
2029 *
2030 * Specifically: for a service which wants a special directory "foo/" we first create a
2031 * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2032 * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2033 * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2034 * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
2035 * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2036 * disabling the access boundary for the service and making sure it only gets access to the
2037 * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2038 *
2039 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2040 * owned by the service itself.
2041 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2042 * files or sockets with other services. */
2043
2044 private_root = strjoin(params->prefix[type], "/private");
2045 if (!private_root) {
2046 r = -ENOMEM;
2047 goto fail;
2048 }
2049
2050 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2051 r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2052 if (r < 0)
2053 goto fail;
2054
2055 pp = strjoin(private_root, "/", *rt);
2056 if (!pp) {
2057 r = -ENOMEM;
2058 goto fail;
2059 }
2060
2061 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2062 r = mkdir_parents_label(pp, 0755);
2063 if (r < 0)
2064 goto fail;
2065
2066 if (is_dir(p, false) > 0 &&
2067 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2068
2069 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2070 * it over. Most likely the service has been upgraded from one that didn't use
2071 * DynamicUser=1, to one that does. */
2072
2073 if (rename(p, pp) < 0) {
2074 r = -errno;
2075 goto fail;
2076 }
2077 } else {
2078 /* Otherwise, create the actual directory for the service */
2079
2080 r = mkdir_label(pp, context->directories[type].mode);
2081 if (r < 0 && r != -EEXIST)
2082 goto fail;
2083 }
2084
2085 parent = dirname_malloc(p);
2086 if (!parent) {
2087 r = -ENOMEM;
2088 goto fail;
2089 }
2090
2091 r = path_make_relative(parent, pp, &relative);
2092 if (r < 0)
2093 goto fail;
2094
2095 /* And link it up from the original place */
2096 r = symlink_idempotent(relative, p);
2097 if (r < 0)
2098 goto fail;
2099
2100 /* Lock down the access mode */
2101 if (chmod(pp, context->directories[type].mode) < 0) {
2102 r = -errno;
2103 goto fail;
2104 }
2105 } else {
2106 r = mkdir_label(p, context->directories[type].mode);
2107 if (r < 0 && r != -EEXIST)
2108 goto fail;
2109 if (r == -EEXIST && !context->dynamic_user)
2110 continue;
2111 }
2112
2113 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2114 * a service, and shall not be writable. */
2115 if (type == EXEC_DIRECTORY_CONFIGURATION)
2116 continue;
2117
2118 /* Then, change the ownership of the whole tree, if necessary */
2119 r = path_chown_recursive(pp ?: p, uid, gid);
2120 if (r < 0)
2121 goto fail;
2122 }
2123
2124 return 0;
2125
2126 fail:
2127 *exit_status = exit_status_table[type];
2128 return r;
2129 }
2130
2131 #if ENABLE_SMACK
2132 static int setup_smack(
2133 const ExecContext *context,
2134 const ExecCommand *command) {
2135
2136 int r;
2137
2138 assert(context);
2139 assert(command);
2140
2141 if (context->smack_process_label) {
2142 r = mac_smack_apply_pid(0, context->smack_process_label);
2143 if (r < 0)
2144 return r;
2145 }
2146 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2147 else {
2148 _cleanup_free_ char *exec_label = NULL;
2149
2150 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2151 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2152 return r;
2153
2154 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2155 if (r < 0)
2156 return r;
2157 }
2158 #endif
2159
2160 return 0;
2161 }
2162 #endif
2163
2164 static int compile_bind_mounts(
2165 const ExecContext *context,
2166 const ExecParameters *params,
2167 BindMount **ret_bind_mounts,
2168 size_t *ret_n_bind_mounts,
2169 char ***ret_empty_directories) {
2170
2171 _cleanup_strv_free_ char **empty_directories = NULL;
2172 BindMount *bind_mounts;
2173 size_t n, h = 0, i;
2174 ExecDirectoryType t;
2175 int r;
2176
2177 assert(context);
2178 assert(params);
2179 assert(ret_bind_mounts);
2180 assert(ret_n_bind_mounts);
2181 assert(ret_empty_directories);
2182
2183 n = context->n_bind_mounts;
2184 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2185 if (!params->prefix[t])
2186 continue;
2187
2188 n += strv_length(context->directories[t].paths);
2189 }
2190
2191 if (n <= 0) {
2192 *ret_bind_mounts = NULL;
2193 *ret_n_bind_mounts = 0;
2194 *ret_empty_directories = NULL;
2195 return 0;
2196 }
2197
2198 bind_mounts = new(BindMount, n);
2199 if (!bind_mounts)
2200 return -ENOMEM;
2201
2202 for (i = 0; i < context->n_bind_mounts; i++) {
2203 BindMount *item = context->bind_mounts + i;
2204 char *s, *d;
2205
2206 s = strdup(item->source);
2207 if (!s) {
2208 r = -ENOMEM;
2209 goto finish;
2210 }
2211
2212 d = strdup(item->destination);
2213 if (!d) {
2214 free(s);
2215 r = -ENOMEM;
2216 goto finish;
2217 }
2218
2219 bind_mounts[h++] = (BindMount) {
2220 .source = s,
2221 .destination = d,
2222 .read_only = item->read_only,
2223 .recursive = item->recursive,
2224 .ignore_enoent = item->ignore_enoent,
2225 };
2226 }
2227
2228 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2229 char **suffix;
2230
2231 if (!params->prefix[t])
2232 continue;
2233
2234 if (strv_isempty(context->directories[t].paths))
2235 continue;
2236
2237 if (context->dynamic_user &&
2238 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2239 !(context->root_directory || context->root_image)) {
2240 char *private_root;
2241
2242 /* So this is for a dynamic user, and we need to make sure the process can access its own
2243 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2244 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2245
2246 private_root = strjoin(params->prefix[t], "/private");
2247 if (!private_root) {
2248 r = -ENOMEM;
2249 goto finish;
2250 }
2251
2252 r = strv_consume(&empty_directories, private_root);
2253 if (r < 0)
2254 goto finish;
2255 }
2256
2257 STRV_FOREACH(suffix, context->directories[t].paths) {
2258 char *s, *d;
2259
2260 if (context->dynamic_user &&
2261 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2262 s = strjoin(params->prefix[t], "/private/", *suffix);
2263 else
2264 s = strjoin(params->prefix[t], "/", *suffix);
2265 if (!s) {
2266 r = -ENOMEM;
2267 goto finish;
2268 }
2269
2270 if (context->dynamic_user &&
2271 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2272 (context->root_directory || context->root_image))
2273 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2274 * directory is not created on the root directory. So, let's bind-mount the directory
2275 * on the 'non-private' place. */
2276 d = strjoin(params->prefix[t], "/", *suffix);
2277 else
2278 d = strdup(s);
2279 if (!d) {
2280 free(s);
2281 r = -ENOMEM;
2282 goto finish;
2283 }
2284
2285 bind_mounts[h++] = (BindMount) {
2286 .source = s,
2287 .destination = d,
2288 .read_only = false,
2289 .recursive = true,
2290 .ignore_enoent = false,
2291 };
2292 }
2293 }
2294
2295 assert(h == n);
2296
2297 *ret_bind_mounts = bind_mounts;
2298 *ret_n_bind_mounts = n;
2299 *ret_empty_directories = TAKE_PTR(empty_directories);
2300
2301 return (int) n;
2302
2303 finish:
2304 bind_mount_free_many(bind_mounts, h);
2305 return r;
2306 }
2307
2308 static int apply_mount_namespace(
2309 const Unit *u,
2310 const ExecCommand *command,
2311 const ExecContext *context,
2312 const ExecParameters *params,
2313 const ExecRuntime *runtime) {
2314
2315 _cleanup_strv_free_ char **empty_directories = NULL;
2316 char *tmp = NULL, *var = NULL;
2317 const char *root_dir = NULL, *root_image = NULL;
2318 NamespaceInfo ns_info;
2319 bool needs_sandboxing;
2320 BindMount *bind_mounts = NULL;
2321 size_t n_bind_mounts = 0;
2322 int r;
2323
2324 assert(context);
2325
2326 /* The runtime struct only contains the parent of the private /tmp,
2327 * which is non-accessible to world users. Inside of it there's a /tmp
2328 * that is sticky, and that's the one we want to use here. */
2329
2330 if (context->private_tmp && runtime) {
2331 if (runtime->tmp_dir)
2332 tmp = strjoina(runtime->tmp_dir, "/tmp");
2333 if (runtime->var_tmp_dir)
2334 var = strjoina(runtime->var_tmp_dir, "/tmp");
2335 }
2336
2337 if (params->flags & EXEC_APPLY_CHROOT) {
2338 root_image = context->root_image;
2339
2340 if (!root_image)
2341 root_dir = context->root_directory;
2342 }
2343
2344 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2345 if (r < 0)
2346 return r;
2347
2348 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2349 if (needs_sandboxing)
2350 ns_info = (NamespaceInfo) {
2351 .ignore_protect_paths = false,
2352 .private_dev = context->private_devices,
2353 .protect_control_groups = context->protect_control_groups,
2354 .protect_kernel_tunables = context->protect_kernel_tunables,
2355 .protect_kernel_modules = context->protect_kernel_modules,
2356 .mount_apivfs = context->mount_apivfs,
2357 .private_mounts = context->private_mounts,
2358 };
2359 else if (!context->dynamic_user && root_dir)
2360 /*
2361 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2362 * sandbox info, otherwise enforce it, don't ignore protected paths and
2363 * fail if we are enable to apply the sandbox inside the mount namespace.
2364 */
2365 ns_info = (NamespaceInfo) {
2366 .ignore_protect_paths = true,
2367 };
2368 else
2369 ns_info = (NamespaceInfo) {};
2370
2371 r = setup_namespace(root_dir, root_image,
2372 &ns_info, context->read_write_paths,
2373 needs_sandboxing ? context->read_only_paths : NULL,
2374 needs_sandboxing ? context->inaccessible_paths : NULL,
2375 empty_directories,
2376 bind_mounts,
2377 n_bind_mounts,
2378 context->temporary_filesystems,
2379 context->n_temporary_filesystems,
2380 tmp,
2381 var,
2382 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2383 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2384 context->mount_flags,
2385 DISSECT_IMAGE_DISCARD_ON_LOOP);
2386
2387 bind_mount_free_many(bind_mounts, n_bind_mounts);
2388
2389 /* If we couldn't set up the namespace this is probably due to a
2390 * missing capability. In this case, silently proceeed. */
2391 if (IN_SET(r, -EPERM, -EACCES)) {
2392 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2393 return 0;
2394 }
2395
2396 return r;
2397 }
2398
2399 static int apply_working_directory(
2400 const ExecContext *context,
2401 const ExecParameters *params,
2402 const char *home,
2403 const bool needs_mount_ns,
2404 int *exit_status) {
2405
2406 const char *d, *wd;
2407
2408 assert(context);
2409 assert(exit_status);
2410
2411 if (context->working_directory_home) {
2412
2413 if (!home) {
2414 *exit_status = EXIT_CHDIR;
2415 return -ENXIO;
2416 }
2417
2418 wd = home;
2419
2420 } else if (context->working_directory)
2421 wd = context->working_directory;
2422 else
2423 wd = "/";
2424
2425 if (params->flags & EXEC_APPLY_CHROOT) {
2426 if (!needs_mount_ns && context->root_directory)
2427 if (chroot(context->root_directory) < 0) {
2428 *exit_status = EXIT_CHROOT;
2429 return -errno;
2430 }
2431
2432 d = wd;
2433 } else
2434 d = prefix_roota(context->root_directory, wd);
2435
2436 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2437 *exit_status = EXIT_CHDIR;
2438 return -errno;
2439 }
2440
2441 return 0;
2442 }
2443
2444 static int setup_keyring(
2445 const Unit *u,
2446 const ExecContext *context,
2447 const ExecParameters *p,
2448 uid_t uid, gid_t gid) {
2449
2450 key_serial_t keyring;
2451 int r = 0;
2452 uid_t saved_uid;
2453 gid_t saved_gid;
2454
2455 assert(u);
2456 assert(context);
2457 assert(p);
2458
2459 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2460 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2461 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2462 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2463 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2464 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2465
2466 if (!(p->flags & EXEC_NEW_KEYRING))
2467 return 0;
2468
2469 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2470 return 0;
2471
2472 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2473 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2474 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2475 * & group is just as nasty as acquiring a reference to the user keyring. */
2476
2477 saved_uid = getuid();
2478 saved_gid = getgid();
2479
2480 if (gid_is_valid(gid) && gid != saved_gid) {
2481 if (setregid(gid, -1) < 0)
2482 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2483 }
2484
2485 if (uid_is_valid(uid) && uid != saved_uid) {
2486 if (setreuid(uid, -1) < 0) {
2487 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2488 goto out;
2489 }
2490 }
2491
2492 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2493 if (keyring == -1) {
2494 if (errno == ENOSYS)
2495 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2496 else if (IN_SET(errno, EACCES, EPERM))
2497 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2498 else if (errno == EDQUOT)
2499 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2500 else
2501 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2502
2503 goto out;
2504 }
2505
2506 /* When requested link the user keyring into the session keyring. */
2507 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2508
2509 if (keyctl(KEYCTL_LINK,
2510 KEY_SPEC_USER_KEYRING,
2511 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2512 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2513 goto out;
2514 }
2515 }
2516
2517 /* Restore uid/gid back */
2518 if (uid_is_valid(uid) && uid != saved_uid) {
2519 if (setreuid(saved_uid, -1) < 0) {
2520 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2521 goto out;
2522 }
2523 }
2524
2525 if (gid_is_valid(gid) && gid != saved_gid) {
2526 if (setregid(saved_gid, -1) < 0)
2527 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2528 }
2529
2530 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2531 if (!sd_id128_is_null(u->invocation_id)) {
2532 key_serial_t key;
2533
2534 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2535 if (key == -1)
2536 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2537 else {
2538 if (keyctl(KEYCTL_SETPERM, key,
2539 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2540 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2541 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2542 }
2543 }
2544
2545 out:
2546 /* Revert back uid & gid for the the last time, and exit */
2547 /* no extra logging, as only the first already reported error matters */
2548 if (getuid() != saved_uid)
2549 (void) setreuid(saved_uid, -1);
2550
2551 if (getgid() != saved_gid)
2552 (void) setregid(saved_gid, -1);
2553
2554 return r;
2555 }
2556
/* Appends the valid (i.e. non-negative) file descriptors of a socket pair to array[], advancing *n
 * for each one added. A NULL pair is accepted and adds nothing. The caller must make sure array[]
 * has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[2]) {
        size_t end;

        assert(array);
        assert(n);

        if (!pair)
                return;

        /* Visit both ends in order, skipping those that are not open. */
        for (end = 0; end < 2; end++)
                if (pair[end] >= 0)
                        array[(*n)++] = pair[end];
}
2569
2570 static int close_remaining_fds(
2571 const ExecParameters *params,
2572 const ExecRuntime *runtime,
2573 const DynamicCreds *dcreds,
2574 int user_lookup_fd,
2575 int socket_fd,
2576 int exec_fd,
2577 int *fds, size_t n_fds) {
2578
2579 size_t n_dont_close = 0;
2580 int dont_close[n_fds + 12];
2581
2582 assert(params);
2583
2584 if (params->stdin_fd >= 0)
2585 dont_close[n_dont_close++] = params->stdin_fd;
2586 if (params->stdout_fd >= 0)
2587 dont_close[n_dont_close++] = params->stdout_fd;
2588 if (params->stderr_fd >= 0)
2589 dont_close[n_dont_close++] = params->stderr_fd;
2590
2591 if (socket_fd >= 0)
2592 dont_close[n_dont_close++] = socket_fd;
2593 if (exec_fd >= 0)
2594 dont_close[n_dont_close++] = exec_fd;
2595 if (n_fds > 0) {
2596 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2597 n_dont_close += n_fds;
2598 }
2599
2600 if (runtime)
2601 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2602
2603 if (dcreds) {
2604 if (dcreds->user)
2605 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2606 if (dcreds->group)
2607 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2608 }
2609
2610 if (user_lookup_fd >= 0)
2611 dont_close[n_dont_close++] = user_lookup_fd;
2612
2613 return close_all_fds(dont_close, n_dont_close);
2614 }
2615
2616 static int send_user_lookup(
2617 Unit *unit,
2618 int user_lookup_fd,
2619 uid_t uid,
2620 gid_t gid) {
2621
2622 assert(unit);
2623
2624 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2625 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2626 * specified. */
2627
2628 if (user_lookup_fd < 0)
2629 return 0;
2630
2631 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2632 return 0;
2633
2634 if (writev(user_lookup_fd,
2635 (struct iovec[]) {
2636 IOVEC_INIT(&uid, sizeof(uid)),
2637 IOVEC_INIT(&gid, sizeof(gid)),
2638 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2639 return -errno;
2640
2641 return 0;
2642 }
2643
2644 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2645 int r;
2646
2647 assert(c);
2648 assert(home);
2649 assert(buf);
2650
2651 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2652
2653 if (*home)
2654 return 0;
2655
2656 if (!c->working_directory_home)
2657 return 0;
2658
2659 if (uid == 0) {
2660 /* Hardcode /root as home directory for UID 0 */
2661 *home = "/root";
2662 return 1;
2663 }
2664
2665 r = get_home_dir(buf);
2666 if (r < 0)
2667 return r;
2668
2669 *home = *buf;
2670 return 1;
2671 }
2672
2673 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2674 _cleanup_strv_free_ char ** list = NULL;
2675 ExecDirectoryType t;
2676 int r;
2677
2678 assert(c);
2679 assert(p);
2680 assert(ret);
2681
2682 assert(c->dynamic_user);
2683
2684 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2685 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2686 * directories. */
2687
2688 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2689 char **i;
2690
2691 if (t == EXEC_DIRECTORY_CONFIGURATION)
2692 continue;
2693
2694 if (!p->prefix[t])
2695 continue;
2696
2697 STRV_FOREACH(i, c->directories[t].paths) {
2698 char *e;
2699
2700 if (t == EXEC_DIRECTORY_RUNTIME)
2701 e = strjoin(p->prefix[t], "/", *i);
2702 else
2703 e = strjoin(p->prefix[t], "/private/", *i);
2704 if (!e)
2705 return -ENOMEM;
2706
2707 r = strv_consume(&list, e);
2708 if (r < 0)
2709 return r;
2710 }
2711 }
2712
2713 *ret = TAKE_PTR(list);
2714
2715 return 0;
2716 }
2717
2718 static char *exec_command_line(char **argv);
2719
2720 static int exec_child(
2721 Unit *unit,
2722 const ExecCommand *command,
2723 const ExecContext *context,
2724 const ExecParameters *params,
2725 ExecRuntime *runtime,
2726 DynamicCreds *dcreds,
2727 int socket_fd,
2728 int named_iofds[3],
2729 int *fds,
2730 size_t n_socket_fds,
2731 size_t n_storage_fds,
2732 char **files_env,
2733 int user_lookup_fd,
2734 int *exit_status) {
2735
2736 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2737 int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
2738 _cleanup_free_ gid_t *supplementary_gids = NULL;
2739 const char *username = NULL, *groupname = NULL;
2740 _cleanup_free_ char *home_buffer = NULL;
2741 const char *home = NULL, *shell = NULL;
2742 dev_t journal_stream_dev = 0;
2743 ino_t journal_stream_ino = 0;
2744 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2745 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
2746 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
2747 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
2748 #if HAVE_SELINUX
2749 _cleanup_free_ char *mac_selinux_context_net = NULL;
2750 bool use_selinux = false;
2751 #endif
2752 #if ENABLE_SMACK
2753 bool use_smack = false;
2754 #endif
2755 #if HAVE_APPARMOR
2756 bool use_apparmor = false;
2757 #endif
2758 uid_t uid = UID_INVALID;
2759 gid_t gid = GID_INVALID;
2760 size_t n_fds;
2761 ExecDirectoryType dt;
2762 int secure_bits;
2763
2764 assert(unit);
2765 assert(command);
2766 assert(context);
2767 assert(params);
2768 assert(exit_status);
2769
2770 rename_process_from_path(command->path);
2771
2772 /* We reset exactly these signals, since they are the
2773 * only ones we set to SIG_IGN in the main daemon. All
2774 * others we leave untouched because we set them to
2775 * SIG_DFL or a valid handler initially, both of which
2776 * will be demoted to SIG_DFL. */
2777 (void) default_signals(SIGNALS_CRASH_HANDLER,
2778 SIGNALS_IGNORE, -1);
2779
2780 if (context->ignore_sigpipe)
2781 (void) ignore_signals(SIGPIPE, -1);
2782
2783 r = reset_signal_mask();
2784 if (r < 0) {
2785 *exit_status = EXIT_SIGNAL_MASK;
2786 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2787 }
2788
2789 if (params->idle_pipe)
2790 do_idle_pipe_dance(params->idle_pipe);
2791
2792 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2793 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2794 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2795 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2796
2797 log_forget_fds();
2798 log_set_open_when_needed(true);
2799
2800 /* In case anything used libc syslog(), close this here, too */
2801 closelog();
2802
2803 n_fds = n_socket_fds + n_storage_fds;
2804 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
2805 if (r < 0) {
2806 *exit_status = EXIT_FDS;
2807 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2808 }
2809
2810 if (!context->same_pgrp)
2811 if (setsid() < 0) {
2812 *exit_status = EXIT_SETSID;
2813 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2814 }
2815
2816 exec_context_tty_reset(context, params);
2817
2818 if (unit_shall_confirm_spawn(unit)) {
2819 const char *vc = params->confirm_spawn;
2820 _cleanup_free_ char *cmdline = NULL;
2821
2822 cmdline = exec_command_line(command->argv);
2823 if (!cmdline) {
2824 *exit_status = EXIT_MEMORY;
2825 return log_oom();
2826 }
2827
2828 r = ask_for_confirmation(vc, unit, cmdline);
2829 if (r != CONFIRM_EXECUTE) {
2830 if (r == CONFIRM_PRETEND_SUCCESS) {
2831 *exit_status = EXIT_SUCCESS;
2832 return 0;
2833 }
2834 *exit_status = EXIT_CONFIRM;
2835 log_unit_error(unit, "Execution cancelled by the user");
2836 return -ECANCELED;
2837 }
2838 }
2839
2840 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
2841 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
2842 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
2843 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
2844 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
2845 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
2846 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
2847 *exit_status = EXIT_MEMORY;
2848 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2849 }
2850
2851 if (context->dynamic_user && dcreds) {
2852 _cleanup_strv_free_ char **suggested_paths = NULL;
2853
2854 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
2855 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
2856 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2857 *exit_status = EXIT_USER;
2858 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2859 }
2860
2861 r = compile_suggested_paths(context, params, &suggested_paths);
2862 if (r < 0) {
2863 *exit_status = EXIT_MEMORY;
2864 return log_oom();
2865 }
2866
2867 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2868 if (r < 0) {
2869 *exit_status = EXIT_USER;
2870 if (r == -EILSEQ) {
2871 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2872 return -EOPNOTSUPP;
2873 }
2874 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2875 }
2876
2877 if (!uid_is_valid(uid)) {
2878 *exit_status = EXIT_USER;
2879 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2880 return -ESRCH;
2881 }
2882
2883 if (!gid_is_valid(gid)) {
2884 *exit_status = EXIT_USER;
2885 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2886 return -ESRCH;
2887 }
2888
2889 if (dcreds->user)
2890 username = dcreds->user->name;
2891
2892 } else {
2893 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2894 if (r < 0) {
2895 *exit_status = EXIT_USER;
2896 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2897 }
2898
2899 r = get_fixed_group(context, &groupname, &gid);
2900 if (r < 0) {
2901 *exit_status = EXIT_GROUP;
2902 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2903 }
2904 }
2905
2906 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2907 r = get_supplementary_groups(context, username, groupname, gid,
2908 &supplementary_gids, &ngids);
2909 if (r < 0) {
2910 *exit_status = EXIT_GROUP;
2911 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2912 }
2913
2914 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2915 if (r < 0) {
2916 *exit_status = EXIT_USER;
2917 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2918 }
2919
2920 user_lookup_fd = safe_close(user_lookup_fd);
2921
2922 r = acquire_home(context, uid, &home, &home_buffer);
2923 if (r < 0) {
2924 *exit_status = EXIT_CHDIR;
2925 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2926 }
2927
2928 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2929 * must sure to drop O_NONBLOCK */
2930 if (socket_fd >= 0)
2931 (void) fd_nonblock(socket_fd, false);
2932
2933 r = setup_input(context, params, socket_fd, named_iofds);
2934 if (r < 0) {
2935 *exit_status = EXIT_STDIN;
2936 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2937 }
2938
2939 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2940 if (r < 0) {
2941 *exit_status = EXIT_STDOUT;
2942 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2943 }
2944
2945 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2946 if (r < 0) {
2947 *exit_status = EXIT_STDERR;
2948 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2949 }
2950
2951 if (params->cgroup_path) {
2952 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2953 if (r < 0) {
2954 *exit_status = EXIT_CGROUP;
2955 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2956 }
2957 }
2958
2959 if (context->oom_score_adjust_set) {
2960 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
2961 * prohibit write access to this file, and we shouldn't trip up over that. */
2962 r = set_oom_score_adjust(context->oom_score_adjust);
2963 if (IN_SET(r, -EPERM, -EACCES))
2964 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2965 else if (r < 0) {
2966 *exit_status = EXIT_OOM_ADJUST;
2967 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
2968 }
2969 }
2970
2971 if (context->nice_set)
2972 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2973 *exit_status = EXIT_NICE;
2974 return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
2975 }
2976
2977 if (context->cpu_sched_set) {
2978 struct sched_param param = {
2979 .sched_priority = context->cpu_sched_priority,
2980 };
2981
2982 r = sched_setscheduler(0,
2983 context->cpu_sched_policy |
2984 (context->cpu_sched_reset_on_fork ?
2985 SCHED_RESET_ON_FORK : 0),
2986 &param);
2987 if (r < 0) {
2988 *exit_status = EXIT_SETSCHEDULER;
2989 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
2990 }
2991 }
2992
2993 if (context->cpuset)
2994 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2995 *exit_status = EXIT_CPUAFFINITY;
2996 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
2997 }
2998
2999 if (context->ioprio_set)
3000 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3001 *exit_status = EXIT_IOPRIO;
3002 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3003 }
3004
3005 if (context->timer_slack_nsec != NSEC_INFINITY)
3006 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3007 *exit_status = EXIT_TIMERSLACK;
3008 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3009 }
3010
3011 if (context->personality != PERSONALITY_INVALID) {
3012 r = safe_personality(context->personality);
3013 if (r < 0) {
3014 *exit_status = EXIT_PERSONALITY;
3015 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3016 }
3017 }
3018
3019 if (context->utmp_id)
3020 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3021 context->tty_path,
3022 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
3023 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3024 USER_PROCESS,
3025 username);
3026
3027 if (context->user) {
3028 r = chown_terminal(STDIN_FILENO, uid);
3029 if (r < 0) {
3030 *exit_status = EXIT_STDIN;
3031 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3032 }
3033 }
3034
3035 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
3036 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3037 * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3038 * touch a single hierarchy too. */
3039 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3040 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3041 if (r < 0) {
3042 *exit_status = EXIT_CGROUP;
3043 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3044 }
3045 }
3046
3047 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3048 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3049 if (r < 0)
3050 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3051 }
3052
3053 r = build_environment(
3054 unit,
3055 context,
3056 params,
3057 n_fds,
3058 home,
3059 username,
3060 shell,
3061 journal_stream_dev,
3062 journal_stream_ino,
3063 &our_env);
3064 if (r < 0) {
3065 *exit_status = EXIT_MEMORY;
3066 return log_oom();
3067 }
3068
3069 r = build_pass_environment(context, &pass_env);
3070 if (r < 0) {
3071 *exit_status = EXIT_MEMORY;
3072 return log_oom();
3073 }
3074
3075 accum_env = strv_env_merge(5,
3076 params->environment,
3077 our_env,
3078 pass_env,
3079 context->environment,
3080 files_env,
3081 NULL);
3082 if (!accum_env) {
3083 *exit_status = EXIT_MEMORY;
3084 return log_oom();
3085 }
3086 accum_env = strv_env_clean(accum_env);
3087
3088 (void) umask(context->umask);
3089
3090 r = setup_keyring(unit, context, params, uid, gid);
3091 if (r < 0) {
3092 *exit_status = EXIT_KEYRING;
3093 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3094 }
3095
3096 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3097 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3098
3099 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3100 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3101
3102 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3103 if (needs_ambient_hack)
3104 needs_setuid = false;
3105 else
3106 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3107
3108 if (needs_sandboxing) {
3109 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3110 * present. The actual MAC context application will happen later, as late as possible, to avoid
3111 * impacting our own code paths. */
3112
3113 #if HAVE_SELINUX
3114 use_selinux = mac_selinux_use();
3115 #endif
3116 #if ENABLE_SMACK
3117 use_smack = mac_smack_use();
3118 #endif
3119 #if HAVE_APPARMOR
3120 use_apparmor = mac_apparmor_use();
3121 #endif
3122 }
3123
3124 if (needs_setuid) {
3125 if (context->pam_name && username) {
3126 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3127 if (r < 0) {
3128 *exit_status = EXIT_PAM;
3129 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3130 }
3131 }
3132 }
3133
3134 if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3135 if (ns_type_supported(NAMESPACE_NET)) {
3136 r = setup_netns(runtime->netns_storage_socket);
3137 if (r < 0) {
3138 *exit_status = EXIT_NETWORK;
3139 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3140 }
3141 } else
3142 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3143 }
3144
3145 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3146 if (needs_mount_namespace) {
3147 r = apply_mount_namespace(unit, command, context, params, runtime);
3148 if (r < 0) {
3149 *exit_status = EXIT_NAMESPACE;
3150 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3151 }
3152 }
3153
3154 /* Apply just after mount namespace setup */
3155 r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3156 if (r < 0)
3157 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3158
3159 /* Drop groups as early as possbile */
3160 if (needs_setuid) {
3161 r = enforce_groups(gid, supplementary_gids, ngids);
3162 if (r < 0) {
3163 *exit_status = EXIT_GROUP;
3164 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3165 }
3166 }
3167
3168 if (needs_sandboxing) {
3169 #if HAVE_SELINUX
3170 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3171 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3172 if (r < 0) {
3173 *exit_status = EXIT_SELINUX_CONTEXT;
3174 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3175 }
3176 }
3177 #endif
3178
3179 if (context->private_users) {
3180 r = setup_private_users(uid, gid);
3181 if (r < 0) {
3182 *exit_status = EXIT_USER;
3183 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3184 }
3185 }
3186 }
3187
3188 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3189 * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3190 * however if we have it as we want to keep it open until the final execve(). */
3191
3192 if (params->exec_fd >= 0) {
3193 exec_fd = params->exec_fd;
3194
3195 if (exec_fd < 3 + (int) n_fds) {
3196 int moved_fd;
3197
3198 /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3199 * process we are about to execute. */
3200
3201 moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3202 if (moved_fd < 0) {
3203 *exit_status = EXIT_FDS;
3204 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3205 }
3206
3207 safe_close(exec_fd);
3208 exec_fd = moved_fd;
3209 } else {
3210 /* This fd should be FD_CLOEXEC already, but let's make sure. */
3211 r = fd_cloexec(exec_fd, true);
3212 if (r < 0) {
3213 *exit_status = EXIT_FDS;
3214 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3215 }
3216 }
3217
3218 fds_with_exec_fd = newa(int, n_fds + 1);
3219 memcpy(fds_with_exec_fd, fds, n_fds * sizeof(int));
3220 fds_with_exec_fd[n_fds] = exec_fd;
3221 n_fds_with_exec_fd = n_fds + 1;
3222 } else {
3223 fds_with_exec_fd = fds;
3224 n_fds_with_exec_fd = n_fds;
3225 }
3226
3227 r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3228 if (r >= 0)
3229 r = shift_fds(fds, n_fds);
3230 if (r >= 0)
3231 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3232 if (r < 0) {
3233 *exit_status = EXIT_FDS;
3234 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3235 }
3236
3237 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3238 * and at the right fd numbers. The are no other fds open, with one exception: the exec_fd if it is defined,
3239 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3240 * came this far. */
3241
3242 secure_bits = context->secure_bits;
3243
3244 if (needs_sandboxing) {
3245 uint64_t bset;
3246 int which_failed;
3247
3248 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3249 if (r < 0) {
3250 *exit_status = EXIT_LIMITS;
3251 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3252 }
3253
3254 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3255 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3256 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3257 *exit_status = EXIT_LIMITS;
3258 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3259 }
3260 }
3261
3262 #if ENABLE_SMACK
3263 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3264 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3265 if (use_smack) {
3266 r = setup_smack(context, command);
3267 if (r < 0) {
3268 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3269 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3270 }
3271 }
3272 #endif
3273
3274 bset = context->capability_bounding_set;
3275 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3276 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3277 * instead of us doing that */
3278 if (needs_ambient_hack)
3279 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3280 (UINT64_C(1) << CAP_SETUID) |
3281 (UINT64_C(1) << CAP_SETGID);
3282
3283 if (!cap_test_all(bset)) {
3284 r = capability_bounding_set_drop(bset, false);
3285 if (r < 0) {
3286 *exit_status = EXIT_CAPABILITIES;
3287 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3288 }
3289 }
3290
3291 /* This is done before enforce_user, but ambient set
3292 * does not survive over setresuid() if keep_caps is not set. */
3293 if (!needs_ambient_hack &&
3294 context->capability_ambient_set != 0) {
3295 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3296 if (r < 0) {
3297 *exit_status = EXIT_CAPABILITIES;
3298 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3299 }
3300 }
3301 }
3302
3303 if (needs_setuid) {
3304 if (context->user) {
3305 r = enforce_user(context, uid);
3306 if (r < 0) {
3307 *exit_status = EXIT_USER;
3308 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3309 }
3310
3311 if (!needs_ambient_hack &&
3312 context->capability_ambient_set != 0) {
3313
3314 /* Fix the ambient capabilities after user change. */
3315 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3316 if (r < 0) {
3317 *exit_status = EXIT_CAPABILITIES;
3318 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3319 }
3320
3321 /* If we were asked to change user and ambient capabilities
3322 * were requested, we had to add keep-caps to the securebits
3323 * so that we would maintain the inherited capability set
3324 * through the setresuid(). Make sure that the bit is added
3325 * also to the context secure_bits so that we don't try to
3326 * drop the bit away next. */
3327
3328 secure_bits |= 1<<SECURE_KEEP_CAPS;
3329 }
3330 }
3331 }
3332
3333 if (needs_sandboxing) {
3334 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3335 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3336 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3337 * are restricted. */
3338
3339 #if HAVE_SELINUX
3340 if (use_selinux) {
3341 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3342
3343 if (exec_context) {
3344 r = setexeccon(exec_context);
3345 if (r < 0) {
3346 *exit_status = EXIT_SELINUX_CONTEXT;
3347 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3348 }
3349 }
3350 }
3351 #endif
3352
3353 #if HAVE_APPARMOR
3354 if (use_apparmor && context->apparmor_profile) {
3355 r = aa_change_onexec(context->apparmor_profile);
3356 if (r < 0 && !context->apparmor_profile_ignore) {
3357 *exit_status = EXIT_APPARMOR_PROFILE;
3358 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3359 }
3360 }
3361 #endif
3362
3363 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3364 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3365 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3366 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3367 *exit_status = EXIT_SECUREBITS;
3368 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3369 }
3370
3371 if (context_has_no_new_privileges(context))
3372 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3373 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3374 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3375 }
3376
3377 #if HAVE_SECCOMP
3378 r = apply_address_families(unit, context);
3379 if (r < 0) {
3380 *exit_status = EXIT_ADDRESS_FAMILIES;
3381 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3382 }
3383
3384 r = apply_memory_deny_write_execute(unit, context);
3385 if (r < 0) {
3386 *exit_status = EXIT_SECCOMP;
3387 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3388 }
3389
3390 r = apply_restrict_realtime(unit, context);
3391 if (r < 0) {
3392 *exit_status = EXIT_SECCOMP;
3393 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3394 }
3395
3396 r = apply_restrict_namespaces(unit, context);
3397 if (r < 0) {
3398 *exit_status = EXIT_SECCOMP;
3399 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3400 }
3401
3402 r = apply_protect_sysctl(unit, context);
3403 if (r < 0) {
3404 *exit_status = EXIT_SECCOMP;
3405 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3406 }
3407
3408 r = apply_protect_kernel_modules(unit, context);
3409 if (r < 0) {
3410 *exit_status = EXIT_SECCOMP;
3411 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3412 }
3413
3414 r = apply_private_devices(unit, context);
3415 if (r < 0) {
3416 *exit_status = EXIT_SECCOMP;
3417 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3418 }
3419
3420 r = apply_syscall_archs(unit, context);
3421 if (r < 0) {
3422 *exit_status = EXIT_SECCOMP;
3423 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3424 }
3425
3426 r = apply_lock_personality(unit, context);
3427 if (r < 0) {
3428 *exit_status = EXIT_SECCOMP;
3429 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3430 }
3431
3432 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3433 * by the filter as little as possible. */
3434 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3435 if (r < 0) {
3436 *exit_status = EXIT_SECCOMP;
3437 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3438 }
3439 #endif
3440 }
3441
3442 if (!strv_isempty(context->unset_environment)) {
3443 char **ee = NULL;
3444
3445 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3446 if (!ee) {
3447 *exit_status = EXIT_MEMORY;
3448 return log_oom();
3449 }
3450
3451 strv_free_and_replace(accum_env, ee);
3452 }
3453
3454 final_argv = replace_env_argv(command->argv, accum_env);
3455 if (!final_argv) {
3456 *exit_status = EXIT_MEMORY;
3457 return log_oom();
3458 }
3459
3460 if (DEBUG_LOGGING) {
3461 _cleanup_free_ char *line;
3462
3463 line = exec_command_line(final_argv);
3464 if (line)
3465 log_struct(LOG_DEBUG,
3466 "EXECUTABLE=%s", command->path,
3467 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3468 LOG_UNIT_ID(unit),
3469 LOG_UNIT_INVOCATION_ID(unit));
3470 }
3471
3472 if (exec_fd >= 0) {
3473 uint8_t hot = 1;
3474
3475 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3476 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3477
3478 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3479 *exit_status = EXIT_EXEC;
3480 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3481 }
3482 }
3483
3484 execve(command->path, final_argv, accum_env);
3485 r = -errno;
3486
3487 if (exec_fd >= 0) {
3488 uint8_t hot = 0;
3489
3490 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3491 * that POLLHUP on it no longer means execve() succeeded. */
3492
3493 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3494 *exit_status = EXIT_EXEC;
3495 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
3496 }
3497 }
3498
3499 if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3500 log_struct_errno(LOG_INFO, r,
3501 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3502 LOG_UNIT_ID(unit),
3503 LOG_UNIT_INVOCATION_ID(unit),
3504 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3505 command->path),
3506 "EXECUTABLE=%s", command->path);
3507 return 0;
3508 }
3509
3510 *exit_status = EXIT_EXEC;
3511 return log_unit_error_errno(unit, r, "Failed to execute command: %m");
3512 }
3513
3514 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
3515 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]);
3516
3517 int exec_spawn(Unit *unit,
3518 ExecCommand *command,
3519 const ExecContext *context,
3520 const ExecParameters *params,
3521 ExecRuntime *runtime,
3522 DynamicCreds *dcreds,
3523 pid_t *ret) {
3524
3525 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
3526 _cleanup_strv_free_ char **files_env = NULL;
3527 size_t n_storage_fds = 0, n_socket_fds = 0;
3528 _cleanup_free_ char *line = NULL;
3529 pid_t pid;
3530
3531 assert(unit);
3532 assert(command);
3533 assert(context);
3534 assert(ret);
3535 assert(params);
3536 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
3537
3538 if (context->std_input == EXEC_INPUT_SOCKET ||
3539 context->std_output == EXEC_OUTPUT_SOCKET ||
3540 context->std_error == EXEC_OUTPUT_SOCKET) {
3541
3542 if (params->n_socket_fds > 1) {
3543 log_unit_error(unit, "Got more than one socket.");
3544 return -EINVAL;
3545 }
3546
3547 if (params->n_socket_fds == 0) {
3548 log_unit_error(unit, "Got no socket.");
3549 return -EINVAL;
3550 }
3551
3552 socket_fd = params->fds[0];
3553 } else {
3554 socket_fd = -1;
3555 fds = params->fds;
3556 n_socket_fds = params->n_socket_fds;
3557 n_storage_fds = params->n_storage_fds;
3558 }
3559
3560 r = exec_context_named_iofds(context, params, named_iofds);
3561 if (r < 0)
3562 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3563
3564 r = exec_context_load_environment(unit, context, &files_env);
3565 if (r < 0)
3566 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3567
3568 line = exec_command_line(command->argv);
3569 if (!line)
3570 return log_oom();
3571
3572 log_struct(LOG_DEBUG,
3573 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3574 "EXECUTABLE=%s", command->path,
3575 LOG_UNIT_ID(unit),
3576 LOG_UNIT_INVOCATION_ID(unit));
3577
3578 pid = fork();
3579 if (pid < 0)
3580 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3581
3582 if (pid == 0) {
3583 int exit_status = EXIT_SUCCESS;
3584
3585 r = exec_child(unit,
3586 command,
3587 context,
3588 params,
3589 runtime,
3590 dcreds,
3591 socket_fd,
3592 named_iofds,
3593 fds,
3594 n_socket_fds,
3595 n_storage_fds,
3596 files_env,
3597 unit->manager->user_lookup_fds[1],
3598 &exit_status);
3599
3600 if (r < 0)
3601 log_struct_errno(LOG_ERR, r,
3602 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3603 LOG_UNIT_ID(unit),
3604 LOG_UNIT_INVOCATION_ID(unit),
3605 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3606 exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3607 command->path),
3608 "EXECUTABLE=%s", command->path);
3609
3610 _exit(exit_status);
3611 }
3612
3613 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3614
3615 /* We add the new process to the cgroup both in the child (so
3616 * that we can be sure that no user code is ever executed
3617 * outside of the cgroup) and in the parent (so that we can be
3618 * sure that when we kill the cgroup the process will be
3619 * killed too). */
3620 if (params->cgroup_path)
3621 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3622
3623 exec_status_start(&command->exec_status, pid);
3624
3625 *ret = pid;
3626 return 0;
3627 }
3628
3629 void exec_context_init(ExecContext *c) {
3630 ExecDirectoryType i;
3631
3632 assert(c);
3633
3634 c->umask = 0022;
3635 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3636 c->cpu_sched_policy = SCHED_OTHER;
3637 c->syslog_priority = LOG_DAEMON|LOG_INFO;
3638 c->syslog_level_prefix = true;
3639 c->ignore_sigpipe = true;
3640 c->timer_slack_nsec = NSEC_INFINITY;
3641 c->personality = PERSONALITY_INVALID;
3642 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3643 c->directories[i].mode = 0755;
3644 c->capability_bounding_set = CAP_ALL;
3645 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
3646 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
3647 c->log_level_max = -1;
3648 }
3649
3650 void exec_context_done(ExecContext *c) {
3651 ExecDirectoryType i;
3652 size_t l;
3653
3654 assert(c);
3655
3656 c->environment = strv_free(c->environment);
3657 c->environment_files = strv_free(c->environment_files);
3658 c->pass_environment = strv_free(c->pass_environment);
3659 c->unset_environment = strv_free(c->unset_environment);
3660
3661 rlimit_free_all(c->rlimit);
3662
3663 for (l = 0; l < 3; l++) {
3664 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3665 c->stdio_file[l] = mfree(c->stdio_file[l]);
3666 }
3667
3668 c->working_directory = mfree(c->working_directory);
3669 c->root_directory = mfree(c->root_directory);
3670 c->root_image = mfree(c->root_image);
3671 c->tty_path = mfree(c->tty_path);
3672 c->syslog_identifier = mfree(c->syslog_identifier);
3673 c->user = mfree(c->user);
3674 c->group = mfree(c->group);
3675
3676 c->supplementary_groups = strv_free(c->supplementary_groups);
3677
3678 c->pam_name = mfree(c->pam_name);
3679
3680 c->read_only_paths = strv_free(c->read_only_paths);
3681 c->read_write_paths = strv_free(c->read_write_paths);
3682 c->inaccessible_paths = strv_free(c->inaccessible_paths);
3683
3684 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3685 c->bind_mounts = NULL;
3686 c->n_bind_mounts = 0;
3687 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3688 c->temporary_filesystems = NULL;
3689 c->n_temporary_filesystems = 0;
3690
3691 c->cpuset = cpu_set_mfree(c->cpuset);
3692
3693 c->utmp_id = mfree(c->utmp_id);
3694 c->selinux_context = mfree(c->selinux_context);
3695 c->apparmor_profile = mfree(c->apparmor_profile);
3696 c->smack_process_label = mfree(c->smack_process_label);
3697
3698 c->syscall_filter = hashmap_free(c->syscall_filter);
3699 c->syscall_archs = set_free(c->syscall_archs);
3700 c->address_families = set_free(c->address_families);
3701
3702 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3703 c->directories[i].paths = strv_free(c->directories[i].paths);
3704
3705 c->log_level_max = -1;
3706
3707 exec_context_free_log_extra_fields(c);
3708
3709 c->stdin_data = mfree(c->stdin_data);
3710 c->stdin_data_size = 0;
3711 }
3712
3713 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
3714 char **i;
3715
3716 assert(c);
3717
3718 if (!runtime_prefix)
3719 return 0;
3720
3721 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3722 _cleanup_free_ char *p;
3723
3724 p = strjoin(runtime_prefix, "/", *i);
3725 if (!p)
3726 return -ENOMEM;
3727
3728 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3729 * next. */
3730 (void) rm_rf(p, REMOVE_ROOT);
3731 }
3732
3733 return 0;
3734 }
3735
3736 static void exec_command_done(ExecCommand *c) {
3737 assert(c);
3738
3739 c->path = mfree(c->path);
3740 c->argv = strv_free(c->argv);
3741 }
3742
3743 void exec_command_done_array(ExecCommand *c, size_t n) {
3744 size_t i;
3745
3746 for (i = 0; i < n; i++)
3747 exec_command_done(c+i);
3748 }
3749
3750 ExecCommand* exec_command_free_list(ExecCommand *c) {
3751 ExecCommand *i;
3752
3753 while ((i = c)) {
3754 LIST_REMOVE(command, c, i);
3755 exec_command_done(i);
3756 free(i);
3757 }
3758
3759 return NULL;
3760 }
3761
3762 void exec_command_free_array(ExecCommand **c, size_t n) {
3763 size_t i;
3764
3765 for (i = 0; i < n; i++)
3766 c[i] = exec_command_free_list(c[i]);
3767 }
3768
3769 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
3770 size_t i;
3771
3772 for (i = 0; i < n; i++)
3773 exec_status_reset(&c[i].exec_status);
3774 }
3775
3776 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
3777 size_t i;
3778
3779 for (i = 0; i < n; i++) {
3780 ExecCommand *z;
3781
3782 LIST_FOREACH(command, z, c[i])
3783 exec_status_reset(&z->exec_status);
3784 }
3785 }
3786
3787 typedef struct InvalidEnvInfo {
3788 const Unit *unit;
3789 const char *path;
3790 } InvalidEnvInfo;
3791
3792 static void invalid_env(const char *p, void *userdata) {
3793 InvalidEnvInfo *info = userdata;
3794
3795 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3796 }
3797
3798 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3799 assert(c);
3800
3801 switch (fd_index) {
3802
3803 case STDIN_FILENO:
3804 if (c->std_input != EXEC_INPUT_NAMED_FD)
3805 return NULL;
3806
3807 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3808
3809 case STDOUT_FILENO:
3810 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3811 return NULL;
3812
3813 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3814
3815 case STDERR_FILENO:
3816 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3817 return NULL;
3818
3819 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3820
3821 default:
3822 return NULL;
3823 }
3824 }
3825
3826 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3827 size_t i, targets;
3828 const char* stdio_fdname[3];
3829 size_t n_fds;
3830
3831 assert(c);
3832 assert(p);
3833
3834 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3835 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3836 (c->std_error == EXEC_OUTPUT_NAMED_FD);
3837
3838 for (i = 0; i < 3; i++)
3839 stdio_fdname[i] = exec_context_fdname(c, i);
3840
3841 n_fds = p->n_storage_fds + p->n_socket_fds;
3842
3843 for (i = 0; i < n_fds && targets > 0; i++)
3844 if (named_iofds[STDIN_FILENO] < 0 &&
3845 c->std_input == EXEC_INPUT_NAMED_FD &&
3846 stdio_fdname[STDIN_FILENO] &&
3847 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3848
3849 named_iofds[STDIN_FILENO] = p->fds[i];
3850 targets--;
3851
3852 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3853 c->std_output == EXEC_OUTPUT_NAMED_FD &&
3854 stdio_fdname[STDOUT_FILENO] &&
3855 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3856
3857 named_iofds[STDOUT_FILENO] = p->fds[i];
3858 targets--;
3859
3860 } else if (named_iofds[STDERR_FILENO] < 0 &&
3861 c->std_error == EXEC_OUTPUT_NAMED_FD &&
3862 stdio_fdname[STDERR_FILENO] &&
3863 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3864
3865 named_iofds[STDERR_FILENO] = p->fds[i];
3866 targets--;
3867 }
3868
3869 return targets == 0 ? 0 : -ENOENT;
3870 }
3871
3872 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
3873 char **i, **r = NULL;
3874
3875 assert(c);
3876 assert(l);
3877
3878 STRV_FOREACH(i, c->environment_files) {
3879 char *fn;
3880 int k;
3881 unsigned n;
3882 bool ignore = false;
3883 char **p;
3884 _cleanup_globfree_ glob_t pglob = {};
3885
3886 fn = *i;
3887
3888 if (fn[0] == '-') {
3889 ignore = true;
3890 fn++;
3891 }
3892
3893 if (!path_is_absolute(fn)) {
3894 if (ignore)
3895 continue;
3896
3897 strv_free(r);
3898 return -EINVAL;
3899 }
3900
3901 /* Filename supports globbing, take all matching files */
3902 k = safe_glob(fn, 0, &pglob);
3903 if (k < 0) {
3904 if (ignore)
3905 continue;
3906
3907 strv_free(r);
3908 return k;
3909 }
3910
3911 /* When we don't match anything, -ENOENT should be returned */
3912 assert(pglob.gl_pathc > 0);
3913
3914 for (n = 0; n < pglob.gl_pathc; n++) {
3915 k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3916 if (k < 0) {
3917 if (ignore)
3918 continue;
3919
3920 strv_free(r);
3921 return k;
3922 }
3923 /* Log invalid environment variables with filename */
3924 if (p) {
3925 InvalidEnvInfo info = {
3926 .unit = unit,
3927 .path = pglob.gl_pathv[n]
3928 };
3929
3930 p = strv_env_clean_with_callback(p, invalid_env, &info);
3931 }
3932
3933 if (!r)
3934 r = p;
3935 else {
3936 char **m;
3937
3938 m = strv_env_merge(2, r, p);
3939 strv_free(r);
3940 strv_free(p);
3941 if (!m)
3942 return -ENOMEM;
3943
3944 r = m;
3945 }
3946 }
3947 }
3948
3949 *l = r;
3950
3951 return 0;
3952 }
3953
3954 static bool tty_may_match_dev_console(const char *tty) {
3955 _cleanup_free_ char *resolved = NULL;
3956
3957 if (!tty)
3958 return true;
3959
3960 tty = skip_dev_prefix(tty);
3961
3962 /* trivial identity? */
3963 if (streq(tty, "console"))
3964 return true;
3965
3966 if (resolve_dev_console(&resolved) < 0)
3967 return true; /* if we could not resolve, assume it may */
3968
3969 /* "tty0" means the active VC, so it may be the same sometimes */
3970 return streq(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
3971 }
3972
3973 bool exec_context_may_touch_console(const ExecContext *ec) {
3974
3975 return (ec->tty_reset ||
3976 ec->tty_vhangup ||
3977 ec->tty_vt_disallocate ||
3978 is_terminal_input(ec->std_input) ||
3979 is_terminal_output(ec->std_output) ||
3980 is_terminal_output(ec->std_error)) &&
3981 tty_may_match_dev_console(exec_context_tty_path(ec));
3982 }
3983
3984 static void strv_fprintf(FILE *f, char **l) {
3985 char **g;
3986
3987 assert(f);
3988
3989 STRV_FOREACH(g, l)
3990 fprintf(f, " %s", *g);
3991 }
3992
3993 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
3994 ExecDirectoryType dt;
3995 char **e, **d;
3996 unsigned i;
3997 int r;
3998
3999 assert(c);
4000 assert(f);
4001
4002 prefix = strempty(prefix);
4003
4004 fprintf(f,
4005 "%sUMask: %04o\n"
4006 "%sWorkingDirectory: %s\n"
4007 "%sRootDirectory: %s\n"
4008 "%sNonBlocking: %s\n"
4009 "%sPrivateTmp: %s\n"
4010 "%sPrivateDevices: %s\n"
4011 "%sProtectKernelTunables: %s\n"
4012 "%sProtectKernelModules: %s\n"
4013 "%sProtectControlGroups: %s\n"
4014 "%sPrivateNetwork: %s\n"
4015 "%sPrivateUsers: %s\n"
4016 "%sProtectHome: %s\n"
4017 "%sProtectSystem: %s\n"
4018 "%sMountAPIVFS: %s\n"
4019 "%sIgnoreSIGPIPE: %s\n"
4020 "%sMemoryDenyWriteExecute: %s\n"
4021 "%sRestrictRealtime: %s\n"
4022 "%sKeyringMode: %s\n",
4023 prefix, c->umask,
4024 prefix, c->working_directory ? c->working_directory : "/",
4025 prefix, c->root_directory ? c->root_directory : "/",
4026 prefix, yes_no(c->non_blocking),
4027 prefix, yes_no(c->private_tmp),
4028 prefix, yes_no(c->private_devices),
4029 prefix, yes_no(c->protect_kernel_tunables),
4030 prefix, yes_no(c->protect_kernel_modules),
4031 prefix, yes_no(c->protect_control_groups),
4032 prefix, yes_no(c->private_network),
4033 prefix, yes_no(c->private_users),
4034 prefix, protect_home_to_string(c->protect_home),
4035 prefix, protect_system_to_string(c->protect_system),
4036 prefix, yes_no(c->mount_apivfs),
4037 prefix, yes_no(c->ignore_sigpipe),
4038 prefix, yes_no(c->memory_deny_write_execute),
4039 prefix, yes_no(c->restrict_realtime),
4040 prefix, exec_keyring_mode_to_string(c->keyring_mode));
4041
4042 if (c->root_image)
4043 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4044
4045 STRV_FOREACH(e, c->environment)
4046 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4047
4048 STRV_FOREACH(e, c->environment_files)
4049 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4050
4051 STRV_FOREACH(e, c->pass_environment)
4052 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4053
4054 STRV_FOREACH(e, c->unset_environment)
4055 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4056
4057 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4058
4059 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4060 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4061
4062 STRV_FOREACH(d, c->directories[dt].paths)
4063 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4064 }
4065
4066 if (c->nice_set)
4067 fprintf(f,
4068 "%sNice: %i\n",
4069 prefix, c->nice);
4070
4071 if (c->oom_score_adjust_set)
4072 fprintf(f,
4073 "%sOOMScoreAdjust: %i\n",
4074 prefix, c->oom_score_adjust);
4075
4076 for (i = 0; i < RLIM_NLIMITS; i++)
4077 if (c->rlimit[i]) {
4078 fprintf(f, "Limit%s%s: " RLIM_FMT "\n",
4079 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4080 fprintf(f, "Limit%s%sSoft: " RLIM_FMT "\n",
4081 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4082 }
4083
4084 if (c->ioprio_set) {
4085 _cleanup_free_ char *class_str = NULL;
4086
4087 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4088 if (r >= 0)
4089 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4090
4091 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4092 }
4093
4094 if (c->cpu_sched_set) {
4095 _cleanup_free_ char *policy_str = NULL;
4096
4097 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4098 if (r >= 0)
4099 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4100
4101 fprintf(f,
4102 "%sCPUSchedulingPriority: %i\n"
4103 "%sCPUSchedulingResetOnFork: %s\n",
4104 prefix, c->cpu_sched_priority,
4105 prefix, yes_no(c->cpu_sched_reset_on_fork));
4106 }
4107
4108 if (c->cpuset) {
4109 fprintf(f, "%sCPUAffinity:", prefix);
4110 for (i = 0; i < c->cpuset_ncpus; i++)
4111 if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
4112 fprintf(f, " %u", i);
4113 fputs("\n", f);
4114 }
4115
4116 if (c->timer_slack_nsec != NSEC_INFINITY)
4117 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4118
4119 fprintf(f,
4120 "%sStandardInput: %s\n"
4121 "%sStandardOutput: %s\n"
4122 "%sStandardError: %s\n",
4123 prefix, exec_input_to_string(c->std_input),
4124 prefix, exec_output_to_string(c->std_output),
4125 prefix, exec_output_to_string(c->std_error));
4126
4127 if (c->std_input == EXEC_INPUT_NAMED_FD)
4128 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4129 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4130 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4131 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4132 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4133
4134 if (c->std_input == EXEC_INPUT_FILE)
4135 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4136 if (c->std_output == EXEC_OUTPUT_FILE)
4137 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4138 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4139 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4140 if (c->std_error == EXEC_OUTPUT_FILE)
4141 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4142 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4143 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4144
4145 if (c->tty_path)
4146 fprintf(f,
4147 "%sTTYPath: %s\n"
4148 "%sTTYReset: %s\n"
4149 "%sTTYVHangup: %s\n"
4150 "%sTTYVTDisallocate: %s\n",
4151 prefix, c->tty_path,
4152 prefix, yes_no(c->tty_reset),
4153 prefix, yes_no(c->tty_vhangup),
4154 prefix, yes_no(c->tty_vt_disallocate));
4155
4156 if (IN_SET(c->std_output,
4157 EXEC_OUTPUT_SYSLOG,
4158 EXEC_OUTPUT_KMSG,
4159 EXEC_OUTPUT_JOURNAL,
4160 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4161 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4162 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4163 IN_SET(c->std_error,
4164 EXEC_OUTPUT_SYSLOG,
4165 EXEC_OUTPUT_KMSG,
4166 EXEC_OUTPUT_JOURNAL,
4167 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4168 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4169 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4170
4171 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4172
4173 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4174 if (r >= 0)
4175 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4176
4177 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4178 if (r >= 0)
4179 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4180 }
4181
4182 if (c->log_level_max >= 0) {
4183 _cleanup_free_ char *t = NULL;
4184
4185 (void) log_level_to_string_alloc(c->log_level_max, &t);
4186
4187 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4188 }
4189
4190 if (c->n_log_extra_fields > 0) {
4191 size_t j;
4192
4193 for (j = 0; j < c->n_log_extra_fields; j++) {
4194 fprintf(f, "%sLogExtraFields: ", prefix);
4195 fwrite(c->log_extra_fields[j].iov_base,
4196 1, c->log_extra_fields[j].iov_len,
4197 f);
4198 fputc('\n', f);
4199 }
4200 }
4201
4202 if (c->secure_bits) {
4203 _cleanup_free_ char *str = NULL;
4204
4205 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4206 if (r >= 0)
4207 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4208 }
4209
4210 if (c->capability_bounding_set != CAP_ALL) {
4211 _cleanup_free_ char *str = NULL;
4212
4213 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4214 if (r >= 0)
4215 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4216 }
4217
4218 if (c->capability_ambient_set != 0) {
4219 _cleanup_free_ char *str = NULL;
4220
4221 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4222 if (r >= 0)
4223 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4224 }
4225
4226 if (c->user)
4227 fprintf(f, "%sUser: %s\n", prefix, c->user);
4228 if (c->group)
4229 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4230
4231 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4232
4233 if (!strv_isempty(c->supplementary_groups)) {
4234 fprintf(f, "%sSupplementaryGroups:", prefix);
4235 strv_fprintf(f, c->supplementary_groups);
4236 fputs("\n", f);
4237 }
4238
4239 if (c->pam_name)
4240 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4241
4242 if (!strv_isempty(c->read_write_paths)) {
4243 fprintf(f, "%sReadWritePaths:", prefix);
4244 strv_fprintf(f, c->read_write_paths);
4245 fputs("\n", f);
4246 }
4247
4248 if (!strv_isempty(c->read_only_paths)) {
4249 fprintf(f, "%sReadOnlyPaths:", prefix);
4250 strv_fprintf(f, c->read_only_paths);
4251 fputs("\n", f);
4252 }
4253
4254 if (!strv_isempty(c->inaccessible_paths)) {
4255 fprintf(f, "%sInaccessiblePaths:", prefix);
4256 strv_fprintf(f, c->inaccessible_paths);
4257 fputs("\n", f);
4258 }
4259
4260 if (c->n_bind_mounts > 0)
4261 for (i = 0; i < c->n_bind_mounts; i++)
4262 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4263 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4264 c->bind_mounts[i].ignore_enoent ? "-": "",
4265 c->bind_mounts[i].source,
4266 c->bind_mounts[i].destination,
4267 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4268
4269 if (c->n_temporary_filesystems > 0)
4270 for (i = 0; i < c->n_temporary_filesystems; i++) {
4271 TemporaryFileSystem *t = c->temporary_filesystems + i;
4272
4273 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4274 t->path,
4275 isempty(t->options) ? "" : ":",
4276 strempty(t->options));
4277 }
4278
4279 if (c->utmp_id)
4280 fprintf(f,
4281 "%sUtmpIdentifier: %s\n",
4282 prefix, c->utmp_id);
4283
4284 if (c->selinux_context)
4285 fprintf(f,
4286 "%sSELinuxContext: %s%s\n",
4287 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4288
4289 if (c->apparmor_profile)
4290 fprintf(f,
4291 "%sAppArmorProfile: %s%s\n",
4292 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4293
4294 if (c->smack_process_label)
4295 fprintf(f,
4296 "%sSmackProcessLabel: %s%s\n",
4297 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4298
4299 if (c->personality != PERSONALITY_INVALID)
4300 fprintf(f,
4301 "%sPersonality: %s\n",
4302 prefix, strna(personality_to_string(c->personality)));
4303
4304 fprintf(f,
4305 "%sLockPersonality: %s\n",
4306 prefix, yes_no(c->lock_personality));
4307
4308 if (c->syscall_filter) {
4309 #if HAVE_SECCOMP
4310 Iterator j;
4311 void *id, *val;
4312 bool first = true;
4313 #endif
4314
4315 fprintf(f,
4316 "%sSystemCallFilter: ",
4317 prefix);
4318
4319 if (!c->syscall_whitelist)
4320 fputc('~', f);
4321
4322 #if HAVE_SECCOMP
4323 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4324 _cleanup_free_ char *name = NULL;
4325 const char *errno_name = NULL;
4326 int num = PTR_TO_INT(val);
4327
4328 if (first)
4329 first = false;
4330 else
4331 fputc(' ', f);
4332
4333 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4334 fputs(strna(name), f);
4335
4336 if (num >= 0) {
4337 errno_name = errno_to_name(num);
4338 if (errno_name)
4339 fprintf(f, ":%s", errno_name);
4340 else
4341 fprintf(f, ":%d", num);
4342 }
4343 }
4344 #endif
4345
4346 fputc('\n', f);
4347 }
4348
4349 if (c->syscall_archs) {
4350 #if HAVE_SECCOMP
4351 Iterator j;
4352 void *id;
4353 #endif
4354
4355 fprintf(f,
4356 "%sSystemCallArchitectures:",
4357 prefix);
4358
4359 #if HAVE_SECCOMP
4360 SET_FOREACH(id, c->syscall_archs, j)
4361 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4362 #endif
4363 fputc('\n', f);
4364 }
4365
4366 if (exec_context_restrict_namespaces_set(c)) {
4367 _cleanup_free_ char *s = NULL;
4368
4369 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4370 if (r >= 0)
4371 fprintf(f, "%sRestrictNamespaces: %s\n",
4372 prefix, s);
4373 }
4374
4375 if (c->syscall_errno > 0) {
4376 const char *errno_name;
4377
4378 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4379
4380 errno_name = errno_to_name(c->syscall_errno);
4381 if (errno_name)
4382 fprintf(f, "%s\n", errno_name);
4383 else
4384 fprintf(f, "%d\n", c->syscall_errno);
4385 }
4386
4387 if (c->apparmor_profile)
4388 fprintf(f,
4389 "%sAppArmorProfile: %s%s\n",
4390 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4391 }
4392
4393 bool exec_context_maintains_privileges(const ExecContext *c) {
4394 assert(c);
4395
4396 /* Returns true if the process forked off would run under
4397 * an unchanged UID or as root. */
4398
4399 if (!c->user)
4400 return true;
4401
4402 if (streq(c->user, "root") || streq(c->user, "0"))
4403 return true;
4404
4405 return false;
4406 }
4407
4408 int exec_context_get_effective_ioprio(const ExecContext *c) {
4409 int p;
4410
4411 assert(c);
4412
4413 if (c->ioprio_set)
4414 return c->ioprio;
4415
4416 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4417 if (p < 0)
4418 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4419
4420 return p;
4421 }
4422
4423 void exec_context_free_log_extra_fields(ExecContext *c) {
4424 size_t l;
4425
4426 assert(c);
4427
4428 for (l = 0; l < c->n_log_extra_fields; l++)
4429 free(c->log_extra_fields[l].iov_base);
4430 c->log_extra_fields = mfree(c->log_extra_fields);
4431 c->n_log_extra_fields = 0;
4432 }
4433
4434 void exec_status_start(ExecStatus *s, pid_t pid) {
4435 assert(s);
4436
4437 *s = (ExecStatus) {
4438 .pid = pid,
4439 };
4440
4441 dual_timestamp_get(&s->start_timestamp);
4442 }
4443
4444 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4445 assert(s);
4446
4447 if (s->pid != pid) {
4448 *s = (ExecStatus) {
4449 .pid = pid,
4450 };
4451 }
4452
4453 dual_timestamp_get(&s->exit_timestamp);
4454
4455 s->code = code;
4456 s->status = status;
4457
4458 if (context) {
4459 if (context->utmp_id)
4460 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
4461
4462 exec_context_tty_reset(context, NULL);
4463 }
4464 }
4465
4466 void exec_status_reset(ExecStatus *s) {
4467 assert(s);
4468
4469 *s = (ExecStatus) {};
4470 }
4471
4472 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4473 char buf[FORMAT_TIMESTAMP_MAX];
4474
4475 assert(s);
4476 assert(f);
4477
4478 if (s->pid <= 0)
4479 return;
4480
4481 prefix = strempty(prefix);
4482
4483 fprintf(f,
4484 "%sPID: "PID_FMT"\n",
4485 prefix, s->pid);
4486
4487 if (dual_timestamp_is_set(&s->start_timestamp))
4488 fprintf(f,
4489 "%sStart Timestamp: %s\n",
4490 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4491
4492 if (dual_timestamp_is_set(&s->exit_timestamp))
4493 fprintf(f,
4494 "%sExit Timestamp: %s\n"
4495 "%sExit Code: %s\n"
4496 "%sExit Status: %i\n",
4497 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4498 prefix, sigchld_code_to_string(s->code),
4499 prefix, s->status);
4500 }
4501
4502 static char *exec_command_line(char **argv) {
4503 size_t k;
4504 char *n, *p, **a;
4505 bool first = true;
4506
4507 assert(argv);
4508
4509 k = 1;
4510 STRV_FOREACH(a, argv)
4511 k += strlen(*a)+3;
4512
4513 n = new(char, k);
4514 if (!n)
4515 return NULL;
4516
4517 p = n;
4518 STRV_FOREACH(a, argv) {
4519
4520 if (!first)
4521 *(p++) = ' ';
4522 else
4523 first = false;
4524
4525 if (strpbrk(*a, WHITESPACE)) {
4526 *(p++) = '\'';
4527 p = stpcpy(p, *a);
4528 *(p++) = '\'';
4529 } else
4530 p = stpcpy(p, *a);
4531
4532 }
4533
4534 *p = 0;
4535
4536 /* FIXME: this doesn't really handle arguments that have
4537 * spaces and ticks in them */
4538
4539 return n;
4540 }
4541
4542 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4543 _cleanup_free_ char *cmd = NULL;
4544 const char *prefix2;
4545
4546 assert(c);
4547 assert(f);
4548
4549 prefix = strempty(prefix);
4550 prefix2 = strjoina(prefix, "\t");
4551
4552 cmd = exec_command_line(c->argv);
4553 fprintf(f,
4554 "%sCommand Line: %s\n",
4555 prefix, cmd ? cmd : strerror(ENOMEM));
4556
4557 exec_status_dump(&c->exec_status, f, prefix2);
4558 }
4559
4560 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4561 assert(f);
4562
4563 prefix = strempty(prefix);
4564
4565 LIST_FOREACH(command, c, c)
4566 exec_command_dump(c, f, prefix);
4567 }
4568
4569 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4570 ExecCommand *end;
4571
4572 assert(l);
4573 assert(e);
4574
4575 if (*l) {
4576 /* It's kind of important, that we keep the order here */
4577 LIST_FIND_TAIL(command, *l, end);
4578 LIST_INSERT_AFTER(command, *l, end, e);
4579 } else
4580 *l = e;
4581 }
4582
4583 int exec_command_set(ExecCommand *c, const char *path, ...) {
4584 va_list ap;
4585 char **l, *p;
4586
4587 assert(c);
4588 assert(path);
4589
4590 va_start(ap, path);
4591 l = strv_new_ap(path, ap);
4592 va_end(ap);
4593
4594 if (!l)
4595 return -ENOMEM;
4596
4597 p = strdup(path);
4598 if (!p) {
4599 strv_free(l);
4600 return -ENOMEM;
4601 }
4602
4603 free(c->path);
4604 c->path = p;
4605
4606 return strv_free_and_replace(c->argv, l);
4607 }
4608
4609 int exec_command_append(ExecCommand *c, const char *path, ...) {
4610 _cleanup_strv_free_ char **l = NULL;
4611 va_list ap;
4612 int r;
4613
4614 assert(c);
4615 assert(path);
4616
4617 va_start(ap, path);
4618 l = strv_new_ap(path, ap);
4619 va_end(ap);
4620
4621 if (!l)
4622 return -ENOMEM;
4623
4624 r = strv_extend_strv(&c->argv, l, false);
4625 if (r < 0)
4626 return r;
4627
4628 return 0;
4629 }
4630
4631 static void *remove_tmpdir_thread(void *p) {
4632 _cleanup_free_ char *path = p;
4633
4634 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4635 return NULL;
4636 }
4637
4638 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
4639 int r;
4640
4641 if (!rt)
4642 return NULL;
4643
4644 if (rt->manager)
4645 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
4646
4647 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
4648 if (destroy && rt->tmp_dir) {
4649 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4650
4651 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4652 if (r < 0) {
4653 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4654 free(rt->tmp_dir);
4655 }
4656
4657 rt->tmp_dir = NULL;
4658 }
4659
4660 if (destroy && rt->var_tmp_dir) {
4661 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4662
4663 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4664 if (r < 0) {
4665 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4666 free(rt->var_tmp_dir);
4667 }
4668
4669 rt->var_tmp_dir = NULL;
4670 }
4671
4672 rt->id = mfree(rt->id);
4673 rt->tmp_dir = mfree(rt->tmp_dir);
4674 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
4675 safe_close_pair(rt->netns_storage_socket);
4676 return mfree(rt);
4677 }
4678
4679 static void exec_runtime_freep(ExecRuntime **rt) {
4680 if (*rt)
4681 (void) exec_runtime_free(*rt, false);
4682 }
4683
4684 static int exec_runtime_allocate(ExecRuntime **rt) {
4685 assert(rt);
4686
4687 *rt = new0(ExecRuntime, 1);
4688 if (!*rt)
4689 return -ENOMEM;
4690
4691 (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4692 return 0;
4693 }
4694
4695 static int exec_runtime_add(
4696 Manager *m,
4697 const char *id,
4698 const char *tmp_dir,
4699 const char *var_tmp_dir,
4700 const int netns_storage_socket[2],
4701 ExecRuntime **ret) {
4702
4703 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
4704 int r;
4705
4706 assert(m);
4707 assert(id);
4708
4709 r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
4710 if (r < 0)
4711 return r;
4712
4713 r = exec_runtime_allocate(&rt);
4714 if (r < 0)
4715 return r;
4716
4717 rt->id = strdup(id);
4718 if (!rt->id)
4719 return -ENOMEM;
4720
4721 if (tmp_dir) {
4722 rt->tmp_dir = strdup(tmp_dir);
4723 if (!rt->tmp_dir)
4724 return -ENOMEM;
4725
4726 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
4727 assert(var_tmp_dir);
4728 rt->var_tmp_dir = strdup(var_tmp_dir);
4729 if (!rt->var_tmp_dir)
4730 return -ENOMEM;
4731 }
4732
4733 if (netns_storage_socket) {
4734 rt->netns_storage_socket[0] = netns_storage_socket[0];
4735 rt->netns_storage_socket[1] = netns_storage_socket[1];
4736 }
4737
4738 r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
4739 if (r < 0)
4740 return r;
4741
4742 rt->manager = m;
4743
4744 if (ret)
4745 *ret = rt;
4746
4747 /* do not remove created ExecRuntime object when the operation succeeds. */
4748 rt = NULL;
4749 return 0;
4750 }
4751
4752 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
4753 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
4754 _cleanup_close_pair_ int netns_storage_socket[2] = {-1, -1};
4755 int r;
4756
4757 assert(m);
4758 assert(c);
4759 assert(id);
4760
4761 /* It is not necessary to create ExecRuntime object. */
4762 if (!c->private_network && !c->private_tmp)
4763 return 0;
4764
4765 if (c->private_tmp) {
4766 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
4767 if (r < 0)
4768 return r;
4769 }
4770
4771 if (c->private_network) {
4772 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
4773 return -errno;
4774 }
4775
4776 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
4777 if (r < 0)
4778 return r;
4779
4780 /* Avoid cleanup */
4781 netns_storage_socket[0] = -1;
4782 netns_storage_socket[1] = -1;
4783 return 1;
4784 }
4785
4786 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
4787 ExecRuntime *rt;
4788 int r;
4789
4790 assert(m);
4791 assert(id);
4792 assert(ret);
4793
4794 rt = hashmap_get(m->exec_runtime_by_id, id);
4795 if (rt)
4796 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
4797 goto ref;
4798
4799 if (!create)
4800 return 0;
4801
4802 /* If not found, then create a new object. */
4803 r = exec_runtime_make(m, c, id, &rt);
4804 if (r <= 0)
4805 /* When r == 0, it is not necessary to create ExecRuntime object. */
4806 return r;
4807
4808 ref:
4809 /* increment reference counter. */
4810 rt->n_ref++;
4811 *ret = rt;
4812 return 1;
4813 }
4814
4815 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
4816 if (!rt)
4817 return NULL;
4818
4819 assert(rt->n_ref > 0);
4820
4821 rt->n_ref--;
4822 if (rt->n_ref > 0)
4823 return NULL;
4824
4825 return exec_runtime_free(rt, destroy);
4826 }
4827
4828 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
4829 ExecRuntime *rt;
4830 Iterator i;
4831
4832 assert(m);
4833 assert(f);
4834 assert(fds);
4835
4836 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4837 fprintf(f, "exec-runtime=%s", rt->id);
4838
4839 if (rt->tmp_dir)
4840 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
4841
4842 if (rt->var_tmp_dir)
4843 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
4844
4845 if (rt->netns_storage_socket[0] >= 0) {
4846 int copy;
4847
4848 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4849 if (copy < 0)
4850 return copy;
4851
4852 fprintf(f, " netns-socket-0=%i", copy);
4853 }
4854
4855 if (rt->netns_storage_socket[1] >= 0) {
4856 int copy;
4857
4858 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4859 if (copy < 0)
4860 return copy;
4861
4862 fprintf(f, " netns-socket-1=%i", copy);
4863 }
4864
4865 fputc('\n', f);
4866 }
4867
4868 return 0;
4869 }
4870
4871 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
4872 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
4873 ExecRuntime *rt;
4874 int r;
4875
4876 /* This is for the migration from old (v237 or earlier) deserialization text.
4877 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
4878 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
4879 * so or not from the serialized text, then we always creates a new object owned by this. */
4880
4881 assert(u);
4882 assert(key);
4883 assert(value);
4884
4885 /* Manager manages ExecRuntime objects by the unit id.
4886 * So, we omit the serialized text when the unit does not have id (yet?)... */
4887 if (isempty(u->id)) {
4888 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
4889 return 0;
4890 }
4891
4892 r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
4893 if (r < 0) {
4894 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
4895 return 0;
4896 }
4897
4898 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
4899 if (!rt) {
4900 r = exec_runtime_allocate(&rt_create);
4901 if (r < 0)
4902 return log_oom();
4903
4904 rt_create->id = strdup(u->id);
4905 if (!rt_create->id)
4906 return log_oom();
4907
4908 rt = rt_create;
4909 }
4910
4911 if (streq(key, "tmp-dir")) {
4912 char *copy;
4913
4914 copy = strdup(value);
4915 if (!copy)
4916 return log_oom();
4917
4918 free_and_replace(rt->tmp_dir, copy);
4919
4920 } else if (streq(key, "var-tmp-dir")) {
4921 char *copy;
4922
4923 copy = strdup(value);
4924 if (!copy)
4925 return log_oom();
4926
4927 free_and_replace(rt->var_tmp_dir, copy);
4928
4929 } else if (streq(key, "netns-socket-0")) {
4930 int fd;
4931
4932 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4933 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4934 return 0;
4935 }
4936
4937 safe_close(rt->netns_storage_socket[0]);
4938 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
4939
4940 } else if (streq(key, "netns-socket-1")) {
4941 int fd;
4942
4943 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4944 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4945 return 0;
4946 }
4947
4948 safe_close(rt->netns_storage_socket[1]);
4949 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
4950 } else
4951 return 0;
4952
4953 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
4954 if (rt_create) {
4955 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
4956 if (r < 0) {
4957 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
4958 return 0;
4959 }
4960
4961 rt_create->manager = u->manager;
4962
4963 /* Avoid cleanup */
4964 rt_create = NULL;
4965 }
4966
4967 return 1;
4968 }
4969
4970 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
4971 char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
4972 int r, fd0 = -1, fd1 = -1;
4973 const char *p, *v = value;
4974 size_t n;
4975
4976 assert(m);
4977 assert(value);
4978 assert(fds);
4979
4980 n = strcspn(v, " ");
4981 id = strndupa(v, n);
4982 if (v[n] != ' ')
4983 goto finalize;
4984 p = v + n + 1;
4985
4986 v = startswith(p, "tmp-dir=");
4987 if (v) {
4988 n = strcspn(v, " ");
4989 tmp_dir = strndupa(v, n);
4990 if (v[n] != ' ')
4991 goto finalize;
4992 p = v + n + 1;
4993 }
4994
4995 v = startswith(p, "var-tmp-dir=");
4996 if (v) {
4997 n = strcspn(v, " ");
4998 var_tmp_dir = strndupa(v, n);
4999 if (v[n] != ' ')
5000 goto finalize;
5001 p = v + n + 1;
5002 }
5003
5004 v = startswith(p, "netns-socket-0=");
5005 if (v) {
5006 char *buf;
5007
5008 n = strcspn(v, " ");
5009 buf = strndupa(v, n);
5010 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5011 log_debug("Unable to process exec-runtime netns fd specification.");
5012 return;
5013 }
5014 fd0 = fdset_remove(fds, fd0);
5015 if (v[n] != ' ')
5016 goto finalize;
5017 p = v + n + 1;
5018 }
5019
5020 v = startswith(p, "netns-socket-1=");
5021 if (v) {
5022 char *buf;
5023
5024 n = strcspn(v, " ");
5025 buf = strndupa(v, n);
5026 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5027 log_debug("Unable to process exec-runtime netns fd specification.");
5028 return;
5029 }
5030 fd1 = fdset_remove(fds, fd1);
5031 }
5032
5033 finalize:
5034
5035 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5036 if (r < 0) {
5037 log_debug_errno(r, "Failed to add exec-runtime: %m");
5038 return;
5039 }
5040 }
5041
5042 void exec_runtime_vacuum(Manager *m) {
5043 ExecRuntime *rt;
5044 Iterator i;
5045
5046 assert(m);
5047
5048 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5049
5050 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5051 if (rt->n_ref > 0)
5052 continue;
5053
5054 (void) exec_runtime_free(rt, false);
5055 }
5056 }
5057
5058 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5059 [EXEC_INPUT_NULL] = "null",
5060 [EXEC_INPUT_TTY] = "tty",
5061 [EXEC_INPUT_TTY_FORCE] = "tty-force",
5062 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
5063 [EXEC_INPUT_SOCKET] = "socket",
5064 [EXEC_INPUT_NAMED_FD] = "fd",
5065 [EXEC_INPUT_DATA] = "data",
5066 [EXEC_INPUT_FILE] = "file",
5067 };
5068
5069 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5070
5071 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5072 [EXEC_OUTPUT_INHERIT] = "inherit",
5073 [EXEC_OUTPUT_NULL] = "null",
5074 [EXEC_OUTPUT_TTY] = "tty",
5075 [EXEC_OUTPUT_SYSLOG] = "syslog",
5076 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
5077 [EXEC_OUTPUT_KMSG] = "kmsg",
5078 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5079 [EXEC_OUTPUT_JOURNAL] = "journal",
5080 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5081 [EXEC_OUTPUT_SOCKET] = "socket",
5082 [EXEC_OUTPUT_NAMED_FD] = "fd",
5083 [EXEC_OUTPUT_FILE] = "file",
5084 [EXEC_OUTPUT_FILE_APPEND] = "append",
5085 };
5086
5087 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5088
5089 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5090 [EXEC_UTMP_INIT] = "init",
5091 [EXEC_UTMP_LOGIN] = "login",
5092 [EXEC_UTMP_USER] = "user",
5093 };
5094
5095 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5096
5097 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5098 [EXEC_PRESERVE_NO] = "no",
5099 [EXEC_PRESERVE_YES] = "yes",
5100 [EXEC_PRESERVE_RESTART] = "restart",
5101 };
5102
5103 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5104
5105 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5106 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5107 [EXEC_DIRECTORY_STATE] = "StateDirectory",
5108 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5109 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5110 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5111 };
5112
5113 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5114
5115 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5116 [EXEC_KEYRING_INHERIT] = "inherit",
5117 [EXEC_KEYRING_PRIVATE] = "private",
5118 [EXEC_KEYRING_SHARED] = "shared",
5119 };
5120
5121 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);