]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
tree-wide: fix spelling errors
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <poll.h>
6 #include <sys/eventfd.h>
7 #include <sys/ioctl.h>
8 #include <sys/mman.h>
9 #include <sys/personality.h>
10 #include <sys/prctl.h>
11 #include <sys/shm.h>
12 #include <sys/types.h>
13 #include <sys/un.h>
14 #include <unistd.h>
15 #include <utmpx.h>
16
17 #if HAVE_PAM
18 #include <security/pam_appl.h>
19 #endif
20
21 #if HAVE_SELINUX
22 #include <selinux/selinux.h>
23 #endif
24
25 #if HAVE_SECCOMP
26 #include <seccomp.h>
27 #endif
28
29 #if HAVE_APPARMOR
30 #include <sys/apparmor.h>
31 #endif
32
33 #include "sd-messages.h"
34
35 #include "af-list.h"
36 #include "alloc-util.h"
37 #if HAVE_APPARMOR
38 #include "apparmor-util.h"
39 #endif
40 #include "async.h"
41 #include "barrier.h"
42 #include "cap-list.h"
43 #include "capability-util.h"
44 #include "chown-recursive.h"
45 #include "cgroup-setup.h"
46 #include "cpu-set-util.h"
47 #include "def.h"
48 #include "env-file.h"
49 #include "env-util.h"
50 #include "errno-list.h"
51 #include "execute.h"
52 #include "exit-status.h"
53 #include "fd-util.h"
54 #include "format-util.h"
55 #include "fs-util.h"
56 #include "glob-util.h"
57 #include "io-util.h"
58 #include "ioprio.h"
59 #include "label.h"
60 #include "log.h"
61 #include "macro.h"
62 #include "manager.h"
63 #include "memory-util.h"
64 #include "missing_fs.h"
65 #include "mkdir.h"
66 #include "namespace.h"
67 #include "parse-util.h"
68 #include "path-util.h"
69 #include "process-util.h"
70 #include "rlimit-util.h"
71 #include "rm-rf.h"
72 #if HAVE_SECCOMP
73 #include "seccomp-util.h"
74 #endif
75 #include "securebits-util.h"
76 #include "selinux-util.h"
77 #include "signal-util.h"
78 #include "smack-util.h"
79 #include "socket-util.h"
80 #include "special.h"
81 #include "stat-util.h"
82 #include "string-table.h"
83 #include "string-util.h"
84 #include "strv.h"
85 #include "syslog-util.h"
86 #include "terminal-util.h"
87 #include "umask-util.h"
88 #include "unit.h"
89 #include "user-util.h"
90 #include "utmp-wtmp.h"
91
92 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
93 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
94
95 #define SNDBUF_SIZE (8*1024*1024)
96
/* Renumbers the passed file descriptors in place so that fds[i] ends up at fd number i+3.
 *
 * Each entry not already at its target is duplicated with F_DUPFD (which yields the lowest free fd
 * >= i+3), the old fd is closed and the array slot is updated. If a target number was still occupied,
 * the pass is repeated starting from the first slot that could not be placed, until every slot matches.
 *
 * Returns 0 on success, or a negative errno if fcntl() fails. The array is modified. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0, retry_at;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int idx;

                retry_at = -1;

                for (idx = first; idx < (int) n_fds; idx++) {
                        const int want = idx + 3;
                        int dup_fd;

                        /* Nothing to do if the fd already sits at its final number */
                        if (fds[idx] == want)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, want);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* The wanted number was taken; remember the first such slot so that the
                         * next pass restarts from there */
                        if (retry_at < 0 && dup_fd != want)
                                retry_at = idx;
                }

                first = retry_at;
        } while (retry_at >= 0);

        return 0;
}
141
/* Adjusts the file descriptor flags of the fds that are about to be passed to a child.
 *
 * The array holds n_socket_fds socket fds followed by n_storage_fds further fds. O_NONBLOCK is set or
 * cleared (according to 'nonblock') on the socket fds only; FD_CLOEXEC is cleared on all of them.
 *
 * Returns 0 on success or the first negative error returned by fd_nonblock()/fd_cloexec(). */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        const size_t total = n_socket_fds + n_storage_fds;
        size_t idx;
        int r;

        if (total == 0)
                return 0;

        assert(fds);

        for (idx = 0; idx < total; idx++) {
                const bool is_socket = idx < n_socket_fds;

                /* O_NONBLOCK is only relevant for the socket activation fds */
                if (is_socket) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* These fds are meant to survive exec(), hence always turn off FD_CLOEXEC */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
174
175 static const char *exec_context_tty_path(const ExecContext *context) {
176 assert(context);
177
178 if (context->stdio_as_fds)
179 return NULL;
180
181 if (context->tty_path)
182 return context->tty_path;
183
184 return "/dev/console";
185 }
186
187 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
188 const char *path;
189
190 assert(context);
191
192 path = exec_context_tty_path(context);
193
194 if (context->tty_vhangup) {
195 if (p && p->stdin_fd >= 0)
196 (void) terminal_vhangup_fd(p->stdin_fd);
197 else if (path)
198 (void) terminal_vhangup(path);
199 }
200
201 if (context->tty_reset) {
202 if (p && p->stdin_fd >= 0)
203 (void) reset_terminal_fd(p->stdin_fd, true);
204 else if (path)
205 (void) reset_terminal(path);
206 }
207
208 if (context->tty_vt_disallocate && path)
209 (void) vt_disallocate(path);
210 }
211
212 static bool is_terminal_input(ExecInput i) {
213 return IN_SET(i,
214 EXEC_INPUT_TTY,
215 EXEC_INPUT_TTY_FORCE,
216 EXEC_INPUT_TTY_FAIL);
217 }
218
219 static bool is_terminal_output(ExecOutput o) {
220 return IN_SET(o,
221 EXEC_OUTPUT_TTY,
222 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
223 EXEC_OUTPUT_KMSG_AND_CONSOLE,
224 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
225 }
226
227 static bool is_syslog_output(ExecOutput o) {
228 return IN_SET(o,
229 EXEC_OUTPUT_SYSLOG,
230 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
231 }
232
233 static bool is_kmsg_output(ExecOutput o) {
234 return IN_SET(o,
235 EXEC_OUTPUT_KMSG,
236 EXEC_OUTPUT_KMSG_AND_CONSOLE);
237 }
238
239 static bool exec_context_needs_term(const ExecContext *c) {
240 assert(c);
241
242 /* Return true if the execution context suggests we should set $TERM to something useful. */
243
244 if (is_terminal_input(c->std_input))
245 return true;
246
247 if (is_terminal_output(c->std_output))
248 return true;
249
250 if (is_terminal_output(c->std_error))
251 return true;
252
253 return !!c->tty_path;
254 }
255
/* Opens /dev/null with the given access flags (plus O_NOCTTY) and moves it to fd number 'nfd'.
 * Returns -errno if the open fails, otherwise the result of move_fd(). */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags | O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
267
268 static int connect_journal_socket(
269 int fd,
270 const char *log_namespace,
271 uid_t uid,
272 gid_t gid) {
273
274 union sockaddr_union sa;
275 socklen_t sa_len;
276 uid_t olduid = UID_INVALID;
277 gid_t oldgid = GID_INVALID;
278 const char *j;
279 int r;
280
281 j = log_namespace ?
282 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
283 "/run/systemd/journal/stdout";
284 r = sockaddr_un_set_path(&sa.un, j);
285 if (r < 0)
286 return r;
287 sa_len = r;
288
289 if (gid_is_valid(gid)) {
290 oldgid = getgid();
291
292 if (setegid(gid) < 0)
293 return -errno;
294 }
295
296 if (uid_is_valid(uid)) {
297 olduid = getuid();
298
299 if (seteuid(uid) < 0) {
300 r = -errno;
301 goto restore_gid;
302 }
303 }
304
305 r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
306
307 /* If we fail to restore the uid or gid, things will likely
308 fail later on. This should only happen if an LSM interferes. */
309
310 if (uid_is_valid(uid))
311 (void) seteuid(olduid);
312
313 restore_gid:
314 if (gid_is_valid(gid))
315 (void) setegid(oldgid);
316
317 return r;
318 }
319
320 static int connect_logger_as(
321 const Unit *unit,
322 const ExecContext *context,
323 const ExecParameters *params,
324 ExecOutput output,
325 const char *ident,
326 int nfd,
327 uid_t uid,
328 gid_t gid) {
329
330 _cleanup_close_ int fd = -1;
331 int r;
332
333 assert(context);
334 assert(params);
335 assert(output < _EXEC_OUTPUT_MAX);
336 assert(ident);
337 assert(nfd >= 0);
338
339 fd = socket(AF_UNIX, SOCK_STREAM, 0);
340 if (fd < 0)
341 return -errno;
342
343 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
344 if (r < 0)
345 return r;
346
347 if (shutdown(fd, SHUT_RD) < 0)
348 return -errno;
349
350 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
351
352 if (dprintf(fd,
353 "%s\n"
354 "%s\n"
355 "%i\n"
356 "%i\n"
357 "%i\n"
358 "%i\n"
359 "%i\n",
360 context->syslog_identifier ?: ident,
361 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
362 context->syslog_priority,
363 !!context->syslog_level_prefix,
364 is_syslog_output(output),
365 is_kmsg_output(output),
366 is_terminal_output(output)) < 0)
367 return -errno;
368
369 return move_fd(TAKE_FD(fd), nfd, false);
370 }
371
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves it to fd number 'nfd'.
 * Returns the negative error from open_terminal() on failure, otherwise the result of move_fd(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
384
385 static int acquire_path(const char *path, int flags, mode_t mode) {
386 union sockaddr_union sa;
387 socklen_t sa_len;
388 _cleanup_close_ int fd = -1;
389 int r;
390
391 assert(path);
392
393 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
394 flags |= O_CREAT;
395
396 fd = open(path, flags|O_NOCTTY, mode);
397 if (fd >= 0)
398 return TAKE_FD(fd);
399
400 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
401 return -errno;
402
403 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
404
405 r = sockaddr_un_set_path(&sa.un, path);
406 if (r < 0)
407 return r == -EINVAL ? -ENXIO : r;
408 sa_len = r;
409
410 fd = socket(AF_UNIX, SOCK_STREAM, 0);
411 if (fd < 0)
412 return -errno;
413
414 if (connect(fd, &sa.sa, sa_len) < 0)
415 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
416 * indication that his wasn't an AF_UNIX socket after all */
417
418 if ((flags & O_ACCMODE) == O_RDONLY)
419 r = shutdown(fd, SHUT_WR);
420 else if ((flags & O_ACCMODE) == O_WRONLY)
421 r = shutdown(fd, SHUT_RD);
422 else
423 r = 0;
424 if (r < 0)
425 return -errno;
426
427 return TAKE_FD(fd);
428 }
429
430 static int fixup_input(
431 const ExecContext *context,
432 int socket_fd,
433 bool apply_tty_stdin) {
434
435 ExecInput std_input;
436
437 assert(context);
438
439 std_input = context->std_input;
440
441 if (is_terminal_input(std_input) && !apply_tty_stdin)
442 return EXEC_INPUT_NULL;
443
444 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
445 return EXEC_INPUT_NULL;
446
447 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
448 return EXEC_INPUT_NULL;
449
450 return std_input;
451 }
452
453 static int fixup_output(ExecOutput std_output, int socket_fd) {
454
455 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
456 return EXEC_OUTPUT_INHERIT;
457
458 return std_output;
459 }
460
461 static int setup_input(
462 const ExecContext *context,
463 const ExecParameters *params,
464 int socket_fd,
465 const int named_iofds[static 3]) {
466
467 ExecInput i;
468
469 assert(context);
470 assert(params);
471 assert(named_iofds);
472
473 if (params->stdin_fd >= 0) {
474 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
475 return -errno;
476
477 /* Try to make this the controlling tty, if it is a tty, and reset it */
478 if (isatty(STDIN_FILENO)) {
479 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
480 (void) reset_terminal_fd(STDIN_FILENO, true);
481 }
482
483 return STDIN_FILENO;
484 }
485
486 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
487
488 switch (i) {
489
490 case EXEC_INPUT_NULL:
491 return open_null_as(O_RDONLY, STDIN_FILENO);
492
493 case EXEC_INPUT_TTY:
494 case EXEC_INPUT_TTY_FORCE:
495 case EXEC_INPUT_TTY_FAIL: {
496 int fd;
497
498 fd = acquire_terminal(exec_context_tty_path(context),
499 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
500 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
501 ACQUIRE_TERMINAL_WAIT,
502 USEC_INFINITY);
503 if (fd < 0)
504 return fd;
505
506 return move_fd(fd, STDIN_FILENO, false);
507 }
508
509 case EXEC_INPUT_SOCKET:
510 assert(socket_fd >= 0);
511
512 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
513
514 case EXEC_INPUT_NAMED_FD:
515 assert(named_iofds[STDIN_FILENO] >= 0);
516
517 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
518 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
519
520 case EXEC_INPUT_DATA: {
521 int fd;
522
523 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
524 if (fd < 0)
525 return fd;
526
527 return move_fd(fd, STDIN_FILENO, false);
528 }
529
530 case EXEC_INPUT_FILE: {
531 bool rw;
532 int fd;
533
534 assert(context->stdio_file[STDIN_FILENO]);
535
536 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
537 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
538
539 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
540 if (fd < 0)
541 return fd;
542
543 return move_fd(fd, STDIN_FILENO, false);
544 }
545
546 default:
547 assert_not_reached("Unknown input type");
548 }
549 }
550
551 static bool can_inherit_stderr_from_stdout(
552 const ExecContext *context,
553 ExecOutput o,
554 ExecOutput e) {
555
556 assert(context);
557
558 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
559 * stderr fd */
560
561 if (e == EXEC_OUTPUT_INHERIT)
562 return true;
563 if (e != o)
564 return false;
565
566 if (e == EXEC_OUTPUT_NAMED_FD)
567 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
568
569 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
570 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
571
572 return true;
573 }
574
575 static int setup_output(
576 const Unit *unit,
577 const ExecContext *context,
578 const ExecParameters *params,
579 int fileno,
580 int socket_fd,
581 const int named_iofds[static 3],
582 const char *ident,
583 uid_t uid,
584 gid_t gid,
585 dev_t *journal_stream_dev,
586 ino_t *journal_stream_ino) {
587
588 ExecOutput o;
589 ExecInput i;
590 int r;
591
592 assert(unit);
593 assert(context);
594 assert(params);
595 assert(ident);
596 assert(journal_stream_dev);
597 assert(journal_stream_ino);
598
599 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
600
601 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
602 return -errno;
603
604 return STDOUT_FILENO;
605 }
606
607 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
608 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
609 return -errno;
610
611 return STDERR_FILENO;
612 }
613
614 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
615 o = fixup_output(context->std_output, socket_fd);
616
617 if (fileno == STDERR_FILENO) {
618 ExecOutput e;
619 e = fixup_output(context->std_error, socket_fd);
620
621 /* This expects the input and output are already set up */
622
623 /* Don't change the stderr file descriptor if we inherit all
624 * the way and are not on a tty */
625 if (e == EXEC_OUTPUT_INHERIT &&
626 o == EXEC_OUTPUT_INHERIT &&
627 i == EXEC_INPUT_NULL &&
628 !is_terminal_input(context->std_input) &&
629 getppid () != 1)
630 return fileno;
631
632 /* Duplicate from stdout if possible */
633 if (can_inherit_stderr_from_stdout(context, o, e))
634 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
635
636 o = e;
637
638 } else if (o == EXEC_OUTPUT_INHERIT) {
639 /* If input got downgraded, inherit the original value */
640 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
641 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
642
643 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
644 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
645 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
646
647 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
648 if (getppid() != 1)
649 return fileno;
650
651 /* We need to open /dev/null here anew, to get the right access mode. */
652 return open_null_as(O_WRONLY, fileno);
653 }
654
655 switch (o) {
656
657 case EXEC_OUTPUT_NULL:
658 return open_null_as(O_WRONLY, fileno);
659
660 case EXEC_OUTPUT_TTY:
661 if (is_terminal_input(i))
662 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
663
664 /* We don't reset the terminal if this is just about output */
665 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
666
667 case EXEC_OUTPUT_SYSLOG:
668 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
669 case EXEC_OUTPUT_KMSG:
670 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
671 case EXEC_OUTPUT_JOURNAL:
672 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
673 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
674 if (r < 0) {
675 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
676 r = open_null_as(O_WRONLY, fileno);
677 } else {
678 struct stat st;
679
680 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
681 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
682 * services to detect whether they are connected to the journal or not.
683 *
684 * If both stdout and stderr are connected to a stream then let's make sure to store the data
685 * about STDERR as that's usually the best way to do logging. */
686
687 if (fstat(fileno, &st) >= 0 &&
688 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
689 *journal_stream_dev = st.st_dev;
690 *journal_stream_ino = st.st_ino;
691 }
692 }
693 return r;
694
695 case EXEC_OUTPUT_SOCKET:
696 assert(socket_fd >= 0);
697
698 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
699
700 case EXEC_OUTPUT_NAMED_FD:
701 assert(named_iofds[fileno] >= 0);
702
703 (void) fd_nonblock(named_iofds[fileno], false);
704 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
705
706 case EXEC_OUTPUT_FILE:
707 case EXEC_OUTPUT_FILE_APPEND: {
708 bool rw;
709 int fd, flags;
710
711 assert(context->stdio_file[fileno]);
712
713 rw = context->std_input == EXEC_INPUT_FILE &&
714 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
715
716 if (rw)
717 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
718
719 flags = O_WRONLY;
720 if (o == EXEC_OUTPUT_FILE_APPEND)
721 flags |= O_APPEND;
722
723 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
724 if (fd < 0)
725 return fd;
726
727 return move_fd(fd, fileno, 0);
728 }
729
730 default:
731 assert_not_reached("Unknown error type");
732 }
733 }
734
735 static int chown_terminal(int fd, uid_t uid) {
736 int r;
737
738 assert(fd >= 0);
739
740 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
741 if (isatty(fd) < 1) {
742 if (IN_SET(errno, EINVAL, ENOTTY))
743 return 0; /* not a tty */
744
745 return -errno;
746 }
747
748 /* This might fail. What matters are the results. */
749 r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
750 if (r < 0)
751 return r;
752
753 return 1;
754 }
755
756 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
757 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
758 int r;
759
760 assert(_saved_stdin);
761 assert(_saved_stdout);
762
763 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
764 if (saved_stdin < 0)
765 return -errno;
766
767 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
768 if (saved_stdout < 0)
769 return -errno;
770
771 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
772 if (fd < 0)
773 return fd;
774
775 r = chown_terminal(fd, getuid());
776 if (r < 0)
777 return r;
778
779 r = reset_terminal_fd(fd, true);
780 if (r < 0)
781 return r;
782
783 r = rearrange_stdio(fd, fd, STDERR_FILENO);
784 fd = -1;
785 if (r < 0)
786 return r;
787
788 *_saved_stdin = saved_stdin;
789 *_saved_stdout = saved_stdout;
790
791 saved_stdin = saved_stdout = -1;
792
793 return 0;
794 }
795
796 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
797 assert(err < 0);
798
799 if (err == -ETIMEDOUT)
800 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
801 else {
802 errno = -err;
803 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
804 }
805 }
806
807 static void write_confirm_error(int err, const char *vc, const Unit *u) {
808 _cleanup_close_ int fd = -1;
809
810 assert(vc);
811
812 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
813 if (fd < 0)
814 return;
815
816 write_confirm_error_fd(err, fd, u);
817 }
818
/* Undoes setup_confirm_stdio(): releases the terminal, duplicates the saved fds back onto stdin and
 * stdout (if they are valid) and closes them.
 *
 * Returns 0 on success, or the negative errno of the last dup2() that failed. Both saved fds are
 * closed in any case. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
840
/* Possible outcomes of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* execute the command */
};
846
847 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
848 int saved_stdout = -1, saved_stdin = -1, r;
849 _cleanup_free_ char *e = NULL;
850 char c;
851
852 /* For any internal errors, assume a positive response. */
853 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
854 if (r < 0) {
855 write_confirm_error(r, vc, u);
856 return CONFIRM_EXECUTE;
857 }
858
859 /* confirm_spawn might have been disabled while we were sleeping. */
860 if (manager_is_confirm_spawn_disabled(u->manager)) {
861 r = 1;
862 goto restore_stdio;
863 }
864
865 e = ellipsize(cmdline, 60, 100);
866 if (!e) {
867 log_oom();
868 r = CONFIRM_EXECUTE;
869 goto restore_stdio;
870 }
871
872 for (;;) {
873 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
874 if (r < 0) {
875 write_confirm_error_fd(r, STDOUT_FILENO, u);
876 r = CONFIRM_EXECUTE;
877 goto restore_stdio;
878 }
879
880 switch (c) {
881 case 'c':
882 printf("Resuming normal execution.\n");
883 manager_disable_confirm_spawn();
884 r = 1;
885 break;
886 case 'D':
887 unit_dump(u, stdout, " ");
888 continue; /* ask again */
889 case 'f':
890 printf("Failing execution.\n");
891 r = CONFIRM_PRETEND_FAILURE;
892 break;
893 case 'h':
894 printf(" c - continue, proceed without asking anymore\n"
895 " D - dump, show the state of the unit\n"
896 " f - fail, don't execute the command and pretend it failed\n"
897 " h - help\n"
898 " i - info, show a short summary of the unit\n"
899 " j - jobs, show jobs that are in progress\n"
900 " s - skip, don't execute the command and pretend it succeeded\n"
901 " y - yes, execute the command\n");
902 continue; /* ask again */
903 case 'i':
904 printf(" Description: %s\n"
905 " Unit: %s\n"
906 " Command: %s\n",
907 u->id, u->description, cmdline);
908 continue; /* ask again */
909 case 'j':
910 manager_dump_jobs(u->manager, stdout, " ");
911 continue; /* ask again */
912 case 'n':
913 /* 'n' was removed in favor of 'f'. */
914 printf("Didn't understand 'n', did you mean 'f'?\n");
915 continue; /* ask again */
916 case 's':
917 printf("Skipping execution.\n");
918 r = CONFIRM_PRETEND_SUCCESS;
919 break;
920 case 'y':
921 r = CONFIRM_EXECUTE;
922 break;
923 default:
924 assert_not_reached("Unhandled choice");
925 }
926 break;
927 }
928
929 restore_stdio:
930 restore_confirm_stdio(&saved_stdin, &saved_stdout);
931 return r;
932 }
933
934 static int get_fixed_user(const ExecContext *c, const char **user,
935 uid_t *uid, gid_t *gid,
936 const char **home, const char **shell) {
937 int r;
938 const char *name;
939
940 assert(c);
941
942 if (!c->user)
943 return 0;
944
945 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
946 * (i.e. are "/" or "/bin/nologin"). */
947
948 name = c->user;
949 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
950 if (r < 0)
951 return r;
952
953 *user = name;
954 return 0;
955 }
956
957 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
958 int r;
959 const char *name;
960
961 assert(c);
962
963 if (!c->group)
964 return 0;
965
966 name = c->group;
967 r = get_group_creds(&name, gid, 0);
968 if (r < 0)
969 return r;
970
971 *group = name;
972 return 0;
973 }
974
975 static int get_supplementary_groups(const ExecContext *c, const char *user,
976 const char *group, gid_t gid,
977 gid_t **supplementary_gids, int *ngids) {
978 char **i;
979 int r, k = 0;
980 int ngroups_max;
981 bool keep_groups = false;
982 gid_t *groups = NULL;
983 _cleanup_free_ gid_t *l_gids = NULL;
984
985 assert(c);
986
987 /*
988 * If user is given, then lookup GID and supplementary groups list.
989 * We avoid NSS lookups for gid=0. Also we have to initialize groups
990 * here and as early as possible so we keep the list of supplementary
991 * groups of the caller.
992 */
993 if (user && gid_is_valid(gid) && gid != 0) {
994 /* First step, initialize groups from /etc/groups */
995 if (initgroups(user, gid) < 0)
996 return -errno;
997
998 keep_groups = true;
999 }
1000
1001 if (strv_isempty(c->supplementary_groups))
1002 return 0;
1003
1004 /*
1005 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1006 * be positive, otherwise fail.
1007 */
1008 errno = 0;
1009 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1010 if (ngroups_max <= 0)
1011 return errno_or_else(EOPNOTSUPP);
1012
1013 l_gids = new(gid_t, ngroups_max);
1014 if (!l_gids)
1015 return -ENOMEM;
1016
1017 if (keep_groups) {
1018 /*
1019 * Lookup the list of groups that the user belongs to, we
1020 * avoid NSS lookups here too for gid=0.
1021 */
1022 k = ngroups_max;
1023 if (getgrouplist(user, gid, l_gids, &k) < 0)
1024 return -EINVAL;
1025 } else
1026 k = 0;
1027
1028 STRV_FOREACH(i, c->supplementary_groups) {
1029 const char *g;
1030
1031 if (k >= ngroups_max)
1032 return -E2BIG;
1033
1034 g = *i;
1035 r = get_group_creds(&g, l_gids+k, 0);
1036 if (r < 0)
1037 return r;
1038
1039 k++;
1040 }
1041
1042 /*
1043 * Sets ngids to zero to drop all supplementary groups, happens
1044 * when we are under root and SupplementaryGroups= is empty.
1045 */
1046 if (k == 0) {
1047 *ngids = 0;
1048 return 0;
1049 }
1050
1051 /* Otherwise get the final list of supplementary groups */
1052 groups = memdup(l_gids, sizeof(gid_t) * k);
1053 if (!groups)
1054 return -ENOMEM;
1055
1056 *supplementary_gids = groups;
1057 *ngids = k;
1058
1059 groups = NULL;
1060
1061 return 0;
1062 }
1063
/* Applies the group credentials to the calling process: first the supplementary groups (only if
 * 'ngids' is positive), then the real, effective and saved GID (only if 'gid' is valid).
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        int r;

        if (ngids > 0) {
                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1082
1083 static int enforce_user(const ExecContext *context, uid_t uid) {
1084 assert(context);
1085
1086 if (!uid_is_valid(uid))
1087 return 0;
1088
1089 /* Sets (but doesn't look up) the uid and make sure we keep the
1090 * capabilities while doing so. */
1091
1092 if (context->capability_ambient_set != 0) {
1093
1094 /* First step: If we need to keep capabilities but
1095 * drop privileges we need to make sure we keep our
1096 * caps, while we drop privileges. */
1097 if (uid != 0) {
1098 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1099
1100 if (prctl(PR_GET_SECUREBITS) != sb)
1101 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1102 return -errno;
1103 }
1104 }
1105
1106 /* Second step: actually set the uids */
1107 if (setresuid(uid, uid, uid) < 0)
1108 return -errno;
1109
1110 /* At this point we should have all necessary capabilities but
1111 are otherwise a normal user. However, the caps might got
1112 corrupted due to the setresuid() so we need clean them up
1113 later. This is done outside of this call. */
1114
1115 return 0;
1116 }
1117
#if HAVE_PAM

/* PAM conversation callback that refuses any conversation: all arguments are ignored and
 * PAM_CONV_ERR is returned unconditionally. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        return PAM_CONV_ERR; /* conversations are not supported */
}

#endif
1132
1133 static int setup_pam(
1134 const char *name,
1135 const char *user,
1136 uid_t uid,
1137 gid_t gid,
1138 const char *tty,
1139 char ***env,
1140 const int fds[], size_t n_fds) {
1141
1142 #if HAVE_PAM
1143
1144 static const struct pam_conv conv = {
1145 .conv = null_conv,
1146 .appdata_ptr = NULL
1147 };
1148
1149 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1150 pam_handle_t *handle = NULL;
1151 sigset_t old_ss;
1152 int pam_code = PAM_SUCCESS, r;
1153 char **nv, **e = NULL;
1154 bool close_session = false;
1155 pid_t pam_pid = 0, parent_pid;
1156 int flags = 0;
1157
1158 assert(name);
1159 assert(user);
1160 assert(env);
1161
1162 /* We set up PAM in the parent process, then fork. The child
1163 * will then stay around until killed via PR_GET_PDEATHSIG or
1164 * systemd via the cgroup logic. It will then remove the PAM
1165 * session again. The parent process will exec() the actual
1166 * daemon. We do things this way to ensure that the main PID
1167 * of the daemon is the one we initially fork()ed. */
1168
1169 r = barrier_create(&barrier);
1170 if (r < 0)
1171 goto fail;
1172
1173 if (log_get_max_level() < LOG_DEBUG)
1174 flags |= PAM_SILENT;
1175
1176 pam_code = pam_start(name, user, &conv, &handle);
1177 if (pam_code != PAM_SUCCESS) {
1178 handle = NULL;
1179 goto fail;
1180 }
1181
1182 if (!tty) {
1183 _cleanup_free_ char *q = NULL;
1184
1185 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1186 * out if that's the case, and read the TTY off it. */
1187
1188 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1189 tty = strjoina("/dev/", q);
1190 }
1191
1192 if (tty) {
1193 pam_code = pam_set_item(handle, PAM_TTY, tty);
1194 if (pam_code != PAM_SUCCESS)
1195 goto fail;
1196 }
1197
1198 STRV_FOREACH(nv, *env) {
1199 pam_code = pam_putenv(handle, *nv);
1200 if (pam_code != PAM_SUCCESS)
1201 goto fail;
1202 }
1203
1204 pam_code = pam_acct_mgmt(handle, flags);
1205 if (pam_code != PAM_SUCCESS)
1206 goto fail;
1207
1208 pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
1209 if (pam_code != PAM_SUCCESS)
1210 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));
1211
1212 pam_code = pam_open_session(handle, flags);
1213 if (pam_code != PAM_SUCCESS)
1214 goto fail;
1215
1216 close_session = true;
1217
1218 e = pam_getenvlist(handle);
1219 if (!e) {
1220 pam_code = PAM_BUF_ERR;
1221 goto fail;
1222 }
1223
1224 /* Block SIGTERM, so that we know that it won't get lost in
1225 * the child */
1226
1227 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1228
1229 parent_pid = getpid_cached();
1230
1231 r = safe_fork("(sd-pam)", 0, &pam_pid);
1232 if (r < 0)
1233 goto fail;
1234 if (r == 0) {
1235 int sig, ret = EXIT_PAM;
1236
1237 /* The child's job is to reset the PAM session on
1238 * termination */
1239 barrier_set_role(&barrier, BARRIER_CHILD);
1240
1241 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
1242 * are open here that have been opened by PAM. */
1243 (void) close_many(fds, n_fds);
1244
1245 /* Drop privileges - we don't need any to pam_close_session
1246 * and this will make PR_SET_PDEATHSIG work in most cases.
1247 * If this fails, ignore the error - but expect sd-pam threads
1248 * to fail to exit normally */
1249
1250 r = maybe_setgroups(0, NULL);
1251 if (r < 0)
1252 log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
1253 if (setresgid(gid, gid, gid) < 0)
1254 log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
1255 if (setresuid(uid, uid, uid) < 0)
1256 log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
1257
1258 (void) ignore_signals(SIGPIPE, -1);
1259
1260 /* Wait until our parent died. This will only work if
1261 * the above setresuid() succeeds, otherwise the kernel
1262 * will not allow unprivileged parents kill their privileged
1263 * children this way. We rely on the control groups kill logic
1264 * to do the rest for us. */
1265 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1266 goto child_finish;
1267
1268 /* Tell the parent that our setup is done. This is especially
1269 * important regarding dropping privileges. Otherwise, unit
1270 * setup might race against our setresuid(2) call.
1271 *
1272 * If the parent aborted, we'll detect this below, hence ignore
1273 * return failure here. */
1274 (void) barrier_place(&barrier);
1275
1276 /* Check if our parent process might already have died? */
1277 if (getppid() == parent_pid) {
1278 sigset_t ss;
1279
1280 assert_se(sigemptyset(&ss) >= 0);
1281 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1282
1283 for (;;) {
1284 if (sigwait(&ss, &sig) < 0) {
1285 if (errno == EINTR)
1286 continue;
1287
1288 goto child_finish;
1289 }
1290
1291 assert(sig == SIGTERM);
1292 break;
1293 }
1294 }
1295
1296 pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
1297 if (pam_code != PAM_SUCCESS)
1298 goto child_finish;
1299
1300 /* If our parent died we'll end the session */
1301 if (getppid() != parent_pid) {
1302 pam_code = pam_close_session(handle, flags);
1303 if (pam_code != PAM_SUCCESS)
1304 goto child_finish;
1305 }
1306
1307 ret = 0;
1308
1309 child_finish:
1310 pam_end(handle, pam_code | flags);
1311 _exit(ret);
1312 }
1313
1314 barrier_set_role(&barrier, BARRIER_PARENT);
1315
1316 /* If the child was forked off successfully it will do all the
1317 * cleanups, so forget about the handle here. */
1318 handle = NULL;
1319
1320 /* Unblock SIGTERM again in the parent */
1321 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1322
1323 /* We close the log explicitly here, since the PAM modules
1324 * might have opened it, but we don't want this fd around. */
1325 closelog();
1326
1327 /* Synchronously wait for the child to initialize. We don't care for
1328 * errors as we cannot recover. However, warn loudly if it happens. */
1329 if (!barrier_place_and_sync(&barrier))
1330 log_error("PAM initialization failed");
1331
1332 return strv_free_and_replace(*env, e);
1333
1334 fail:
1335 if (pam_code != PAM_SUCCESS) {
1336 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1337 r = -EPERM; /* PAM errors do not map to errno */
1338 } else
1339 log_error_errno(r, "PAM failed: %m");
1340
1341 if (handle) {
1342 if (close_session)
1343 pam_code = pam_close_session(handle, flags);
1344
1345 pam_end(handle, pam_code | flags);
1346 }
1347
1348 strv_free(e);
1349 closelog();
1350
1351 return r;
1352 #else
1353 return 0;
1354 #endif
1355 }
1356
/* Renames the current process to "(<name>)", where <name> is taken from the last component of 'path'.
 * The resulting string is kept within 10 characters (i.e. the length of "/sbin/init") so that it looks
 * pretty in /bin/ps. If no file name can be derived, "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char buf[11]; /* '(' + at most 8 characters + ')' + NUL */
        const char *name;
        size_t n;

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        n = strlen(name);
        if (n > 8) {
                /* Keep the tail of the name: the end is usually more interesting, since the
                 * beginning might just be "systemd-". */
                name += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, name, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1387
1388 static bool context_has_address_families(const ExecContext *c) {
1389 assert(c);
1390
1391 return c->address_families_whitelist ||
1392 !set_isempty(c->address_families);
1393 }
1394
1395 static bool context_has_syscall_filters(const ExecContext *c) {
1396 assert(c);
1397
1398 return c->syscall_whitelist ||
1399 !hashmap_isempty(c->syscall_filter);
1400 }
1401
1402 static bool context_has_no_new_privileges(const ExecContext *c) {
1403 assert(c);
1404
1405 if (c->no_new_privileges)
1406 return true;
1407
1408 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1409 return false;
1410
1411 /* We need NNP if we have any form of seccomp and are unprivileged */
1412 return context_has_address_families(c) ||
1413 c->memory_deny_write_execute ||
1414 c->restrict_realtime ||
1415 c->restrict_suid_sgid ||
1416 exec_context_restrict_namespaces_set(c) ||
1417 c->protect_clock ||
1418 c->protect_kernel_tunables ||
1419 c->protect_kernel_modules ||
1420 c->protect_kernel_logs ||
1421 c->private_devices ||
1422 context_has_syscall_filters(c) ||
1423 !set_isempty(c->syscall_archs) ||
1424 c->lock_personality ||
1425 c->protect_hostname;
1426 }
1427
1428 #if HAVE_SECCOMP
1429
1430 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1431
1432 if (is_seccomp_available())
1433 return false;
1434
1435 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1436 return true;
1437 }
1438
1439 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1440 uint32_t negative_action, default_action, action;
1441 int r;
1442
1443 assert(u);
1444 assert(c);
1445
1446 if (!context_has_syscall_filters(c))
1447 return 0;
1448
1449 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1450 return 0;
1451
1452 negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1453
1454 if (c->syscall_whitelist) {
1455 default_action = negative_action;
1456 action = SCMP_ACT_ALLOW;
1457 } else {
1458 default_action = SCMP_ACT_ALLOW;
1459 action = negative_action;
1460 }
1461
1462 if (needs_ambient_hack) {
1463 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1464 if (r < 0)
1465 return r;
1466 }
1467
1468 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1469 }
1470
1471 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1472 assert(u);
1473 assert(c);
1474
1475 if (set_isempty(c->syscall_archs))
1476 return 0;
1477
1478 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1479 return 0;
1480
1481 return seccomp_restrict_archs(c->syscall_archs);
1482 }
1483
1484 static int apply_address_families(const Unit* u, const ExecContext *c) {
1485 assert(u);
1486 assert(c);
1487
1488 if (!context_has_address_families(c))
1489 return 0;
1490
1491 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1492 return 0;
1493
1494 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1495 }
1496
1497 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1498 assert(u);
1499 assert(c);
1500
1501 if (!c->memory_deny_write_execute)
1502 return 0;
1503
1504 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1505 return 0;
1506
1507 return seccomp_memory_deny_write_execute();
1508 }
1509
1510 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1511 assert(u);
1512 assert(c);
1513
1514 if (!c->restrict_realtime)
1515 return 0;
1516
1517 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1518 return 0;
1519
1520 return seccomp_restrict_realtime();
1521 }
1522
1523 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1524 assert(u);
1525 assert(c);
1526
1527 if (!c->restrict_suid_sgid)
1528 return 0;
1529
1530 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1531 return 0;
1532
1533 return seccomp_restrict_suid_sgid();
1534 }
1535
1536 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1537 assert(u);
1538 assert(c);
1539
1540 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1541 * let's protect even those systems where this is left on in the kernel. */
1542
1543 if (!c->protect_kernel_tunables)
1544 return 0;
1545
1546 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1547 return 0;
1548
1549 return seccomp_protect_sysctl();
1550 }
1551
1552 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1553 assert(u);
1554 assert(c);
1555
1556 /* Turn off module syscalls on ProtectKernelModules=yes */
1557
1558 if (!c->protect_kernel_modules)
1559 return 0;
1560
1561 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1562 return 0;
1563
1564 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1565 }
1566
1567 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1568 assert(u);
1569 assert(c);
1570
1571 if (!c->protect_kernel_logs)
1572 return 0;
1573
1574 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1575 return 0;
1576
1577 return seccomp_protect_syslog();
1578 }
1579
1580 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1581 assert(u);
1582 assert(c);
1583
1584 if (!c->protect_clock)
1585 return 0;
1586
1587 if (skip_seccomp_unavailable(u, "ProtectClock="))
1588 return 0;
1589
1590 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1591 }
1592
1593 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1594 assert(u);
1595 assert(c);
1596
1597 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1598
1599 if (!c->private_devices)
1600 return 0;
1601
1602 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1603 return 0;
1604
1605 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1606 }
1607
1608 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1609 assert(u);
1610 assert(c);
1611
1612 if (!exec_context_restrict_namespaces_set(c))
1613 return 0;
1614
1615 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1616 return 0;
1617
1618 return seccomp_restrict_namespaces(c->restrict_namespaces);
1619 }
1620
1621 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1622 unsigned long personality;
1623 int r;
1624
1625 assert(u);
1626 assert(c);
1627
1628 if (!c->lock_personality)
1629 return 0;
1630
1631 if (skip_seccomp_unavailable(u, "LockPersonality="))
1632 return 0;
1633
1634 personality = c->personality;
1635
1636 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1637 if (personality == PERSONALITY_INVALID) {
1638
1639 r = opinionated_personality(&personality);
1640 if (r < 0)
1641 return r;
1642 }
1643
1644 return seccomp_lock_personality(personality);
1645 }
1646
1647 #endif
1648
1649 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1650 int r;
1651
1652 assert(u);
1653 assert(c);
1654
1655 if (!c->protect_hostname)
1656 return 0;
1657
1658 if (ns_type_supported(NAMESPACE_UTS)) {
1659 if (unshare(CLONE_NEWUTS) < 0) {
1660 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1661 *ret_exit_status = EXIT_NAMESPACE;
1662 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1663 }
1664
1665 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1666 }
1667 } else
1668 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1669
1670 #if HAVE_SECCOMP
1671 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1672 return 0;
1673
1674 r = seccomp_protect_hostname();
1675 if (r < 0) {
1676 *ret_exit_status = EXIT_SECCOMP;
1677 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1678 }
1679 #endif
1680
1681 return 0;
1682 }
1683
1684 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1685 assert(idle_pipe);
1686
1687 idle_pipe[1] = safe_close(idle_pipe[1]);
1688 idle_pipe[2] = safe_close(idle_pipe[2]);
1689
1690 if (idle_pipe[0] >= 0) {
1691 int r;
1692
1693 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1694
1695 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1696 ssize_t n;
1697
1698 /* Signal systemd that we are bored and want to continue. */
1699 n = write(idle_pipe[3], "x", 1);
1700 if (n > 0)
1701 /* Wait for systemd to react to the signal above. */
1702 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1703 }
1704
1705 idle_pipe[0] = safe_close(idle_pipe[0]);
1706
1707 }
1708
1709 idle_pipe[3] = safe_close(idle_pipe[3]);
1710 }
1711
1712 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1713
1714 static int build_environment(
1715 const Unit *u,
1716 const ExecContext *c,
1717 const ExecParameters *p,
1718 size_t n_fds,
1719 const char *home,
1720 const char *username,
1721 const char *shell,
1722 dev_t journal_stream_dev,
1723 ino_t journal_stream_ino,
1724 char ***ret) {
1725
1726 _cleanup_strv_free_ char **our_env = NULL;
1727 ExecDirectoryType t;
1728 size_t n_env = 0;
1729 char *x;
1730
1731 assert(u);
1732 assert(c);
1733 assert(p);
1734 assert(ret);
1735
1736 our_env = new0(char*, 15 + _EXEC_DIRECTORY_TYPE_MAX);
1737 if (!our_env)
1738 return -ENOMEM;
1739
1740 if (n_fds > 0) {
1741 _cleanup_free_ char *joined = NULL;
1742
1743 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1744 return -ENOMEM;
1745 our_env[n_env++] = x;
1746
1747 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1748 return -ENOMEM;
1749 our_env[n_env++] = x;
1750
1751 joined = strv_join(p->fd_names, ":");
1752 if (!joined)
1753 return -ENOMEM;
1754
1755 x = strjoin("LISTEN_FDNAMES=", joined);
1756 if (!x)
1757 return -ENOMEM;
1758 our_env[n_env++] = x;
1759 }
1760
1761 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1762 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1763 return -ENOMEM;
1764 our_env[n_env++] = x;
1765
1766 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1767 return -ENOMEM;
1768 our_env[n_env++] = x;
1769 }
1770
1771 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1772 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1773 * check the database directly. */
1774 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1775 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1776 if (!x)
1777 return -ENOMEM;
1778 our_env[n_env++] = x;
1779 }
1780
1781 if (home) {
1782 x = strjoin("HOME=", home);
1783 if (!x)
1784 return -ENOMEM;
1785
1786 path_simplify(x + 5, true);
1787 our_env[n_env++] = x;
1788 }
1789
1790 if (username) {
1791 x = strjoin("LOGNAME=", username);
1792 if (!x)
1793 return -ENOMEM;
1794 our_env[n_env++] = x;
1795
1796 x = strjoin("USER=", username);
1797 if (!x)
1798 return -ENOMEM;
1799 our_env[n_env++] = x;
1800 }
1801
1802 if (shell) {
1803 x = strjoin("SHELL=", shell);
1804 if (!x)
1805 return -ENOMEM;
1806
1807 path_simplify(x + 6, true);
1808 our_env[n_env++] = x;
1809 }
1810
1811 if (!sd_id128_is_null(u->invocation_id)) {
1812 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1813 return -ENOMEM;
1814
1815 our_env[n_env++] = x;
1816 }
1817
1818 if (exec_context_needs_term(c)) {
1819 const char *tty_path, *term = NULL;
1820
1821 tty_path = exec_context_tty_path(c);
1822
1823 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1824 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1825 * passes to PID 1 ends up all the way in the console login shown. */
1826
1827 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1828 term = getenv("TERM");
1829 if (!term)
1830 term = default_term_for_tty(tty_path);
1831
1832 x = strjoin("TERM=", term);
1833 if (!x)
1834 return -ENOMEM;
1835 our_env[n_env++] = x;
1836 }
1837
1838 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1839 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1840 return -ENOMEM;
1841
1842 our_env[n_env++] = x;
1843 }
1844
1845 if (c->log_namespace) {
1846 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1847 if (!x)
1848 return -ENOMEM;
1849
1850 our_env[n_env++] = x;
1851 }
1852
1853 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1854 _cleanup_free_ char *pre = NULL, *joined = NULL;
1855 const char *n;
1856
1857 if (!p->prefix[t])
1858 continue;
1859
1860 if (strv_isempty(c->directories[t].paths))
1861 continue;
1862
1863 n = exec_directory_env_name_to_string(t);
1864 if (!n)
1865 continue;
1866
1867 pre = strjoin(p->prefix[t], "/");
1868 if (!pre)
1869 return -ENOMEM;
1870
1871 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1872 if (!joined)
1873 return -ENOMEM;
1874
1875 x = strjoin(n, "=", joined);
1876 if (!x)
1877 return -ENOMEM;
1878
1879 our_env[n_env++] = x;
1880 }
1881
1882 our_env[n_env++] = NULL;
1883 assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1884
1885 *ret = TAKE_PTR(our_env);
1886
1887 return 0;
1888 }
1889
1890 static int build_pass_environment(const ExecContext *c, char ***ret) {
1891 _cleanup_strv_free_ char **pass_env = NULL;
1892 size_t n_env = 0, n_bufsize = 0;
1893 char **i;
1894
1895 STRV_FOREACH(i, c->pass_environment) {
1896 _cleanup_free_ char *x = NULL;
1897 char *v;
1898
1899 v = getenv(*i);
1900 if (!v)
1901 continue;
1902 x = strjoin(*i, "=", v);
1903 if (!x)
1904 return -ENOMEM;
1905
1906 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1907 return -ENOMEM;
1908
1909 pass_env[n_env++] = TAKE_PTR(x);
1910 pass_env[n_env] = NULL;
1911 }
1912
1913 *ret = TAKE_PTR(pass_env);
1914
1915 return 0;
1916 }
1917
1918 static bool exec_needs_mount_namespace(
1919 const ExecContext *context,
1920 const ExecParameters *params,
1921 const ExecRuntime *runtime) {
1922
1923 assert(context);
1924 assert(params);
1925
1926 if (context->root_image)
1927 return true;
1928
1929 if (!strv_isempty(context->read_write_paths) ||
1930 !strv_isempty(context->read_only_paths) ||
1931 !strv_isempty(context->inaccessible_paths))
1932 return true;
1933
1934 if (context->n_bind_mounts > 0)
1935 return true;
1936
1937 if (context->n_temporary_filesystems > 0)
1938 return true;
1939
1940 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
1941 return true;
1942
1943 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1944 return true;
1945
1946 if (context->private_devices ||
1947 context->private_mounts ||
1948 context->protect_system != PROTECT_SYSTEM_NO ||
1949 context->protect_home != PROTECT_HOME_NO ||
1950 context->protect_kernel_tunables ||
1951 context->protect_kernel_modules ||
1952 context->protect_kernel_logs ||
1953 context->protect_control_groups)
1954 return true;
1955
1956 if (context->root_directory) {
1957 ExecDirectoryType t;
1958
1959 if (context->mount_apivfs)
1960 return true;
1961
1962 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1963 if (!params->prefix[t])
1964 continue;
1965
1966 if (!strv_isempty(context->directories[t].paths))
1967 return true;
1968 }
1969 }
1970
1971 if (context->dynamic_user &&
1972 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1973 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1974 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1975 return true;
1976
1977 if (context->log_namespace)
1978 return true;
1979
1980 return false;
1981 }
1982
1983 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
1984 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1985 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1986 _cleanup_close_ int unshare_ready_fd = -1;
1987 _cleanup_(sigkill_waitp) pid_t pid = 0;
1988 uint64_t c = 1;
1989 ssize_t n;
1990 int r;
1991
1992 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
1993 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
1994 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1995 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1996 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1997 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1998 * continues execution normally.
1999 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2000 * does not need CAP_SETUID to write the single line mapping to itself. */
2001
2002 /* Can only set up multiple mappings with CAP_SETUID. */
2003 if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
2004 r = asprintf(&uid_map,
2005 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2006 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2007 ouid, ouid, uid, uid);
2008 else
2009 r = asprintf(&uid_map,
2010 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2011 ouid, ouid);
2012
2013 if (r < 0)
2014 return -ENOMEM;
2015
2016 /* Can only set up multiple mappings with CAP_SETGID. */
2017 if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
2018 r = asprintf(&gid_map,
2019 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2020 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2021 ogid, ogid, gid, gid);
2022 else
2023 r = asprintf(&gid_map,
2024 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2025 ogid, ogid);
2026
2027 if (r < 0)
2028 return -ENOMEM;
2029
2030 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2031 * namespace. */
2032 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2033 if (unshare_ready_fd < 0)
2034 return -errno;
2035
2036 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2037 * failed. */
2038 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2039 return -errno;
2040
2041 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2042 if (r < 0)
2043 return r;
2044 if (r == 0) {
2045 _cleanup_close_ int fd = -1;
2046 const char *a;
2047 pid_t ppid;
2048
2049 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2050 * here, after the parent opened its own user namespace. */
2051
2052 ppid = getppid();
2053 errno_pipe[0] = safe_close(errno_pipe[0]);
2054
2055 /* Wait until the parent unshared the user namespace */
2056 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2057 r = -errno;
2058 goto child_fail;
2059 }
2060
2061 /* Disable the setgroups() system call in the child user namespace, for good. */
2062 a = procfs_file_alloca(ppid, "setgroups");
2063 fd = open(a, O_WRONLY|O_CLOEXEC);
2064 if (fd < 0) {
2065 if (errno != ENOENT) {
2066 r = -errno;
2067 goto child_fail;
2068 }
2069
2070 /* If the file is missing the kernel is too old, let's continue anyway. */
2071 } else {
2072 if (write(fd, "deny\n", 5) < 0) {
2073 r = -errno;
2074 goto child_fail;
2075 }
2076
2077 fd = safe_close(fd);
2078 }
2079
2080 /* First write the GID map */
2081 a = procfs_file_alloca(ppid, "gid_map");
2082 fd = open(a, O_WRONLY|O_CLOEXEC);
2083 if (fd < 0) {
2084 r = -errno;
2085 goto child_fail;
2086 }
2087 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2088 r = -errno;
2089 goto child_fail;
2090 }
2091 fd = safe_close(fd);
2092
2093 /* The write the UID map */
2094 a = procfs_file_alloca(ppid, "uid_map");
2095 fd = open(a, O_WRONLY|O_CLOEXEC);
2096 if (fd < 0) {
2097 r = -errno;
2098 goto child_fail;
2099 }
2100 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2101 r = -errno;
2102 goto child_fail;
2103 }
2104
2105 _exit(EXIT_SUCCESS);
2106
2107 child_fail:
2108 (void) write(errno_pipe[1], &r, sizeof(r));
2109 _exit(EXIT_FAILURE);
2110 }
2111
2112 errno_pipe[1] = safe_close(errno_pipe[1]);
2113
2114 if (unshare(CLONE_NEWUSER) < 0)
2115 return -errno;
2116
2117 /* Let the child know that the namespace is ready now */
2118 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2119 return -errno;
2120
2121 /* Try to read an error code from the child */
2122 n = read(errno_pipe[0], &r, sizeof(r));
2123 if (n < 0)
2124 return -errno;
2125 if (n == sizeof(r)) { /* an error code was sent to us */
2126 if (r < 0)
2127 return r;
2128 return -EIO;
2129 }
2130 if (n != 0) /* on success we should have read 0 bytes */
2131 return -EIO;
2132
2133 r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2134 pid = 0;
2135 if (r < 0)
2136 return r;
2137 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2138 return -EIO;
2139
2140 return 0;
2141 }
2142
2143 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2144 if (!context->dynamic_user)
2145 return false;
2146
2147 if (type == EXEC_DIRECTORY_CONFIGURATION)
2148 return false;
2149
2150 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2151 return false;
2152
2153 return true;
2154 }
2155
2156 static int setup_exec_directory(
2157 const ExecContext *context,
2158 const ExecParameters *params,
2159 uid_t uid,
2160 gid_t gid,
2161 ExecDirectoryType type,
2162 int *exit_status) {
2163
2164 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2165 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2166 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2167 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2168 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2169 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2170 };
2171 char **rt;
2172 int r;
2173
2174 assert(context);
2175 assert(params);
2176 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2177 assert(exit_status);
2178
2179 if (!params->prefix[type])
2180 return 0;
2181
2182 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2183 if (!uid_is_valid(uid))
2184 uid = 0;
2185 if (!gid_is_valid(gid))
2186 gid = 0;
2187 }
2188
2189 STRV_FOREACH(rt, context->directories[type].paths) {
2190 _cleanup_free_ char *p = NULL, *pp = NULL;
2191
2192 p = path_join(params->prefix[type], *rt);
2193 if (!p) {
2194 r = -ENOMEM;
2195 goto fail;
2196 }
2197
2198 r = mkdir_parents_label(p, 0755);
2199 if (r < 0)
2200 goto fail;
2201
2202 if (exec_directory_is_private(context, type)) {
2203 _cleanup_free_ char *private_root = NULL;
2204
2205 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2206 * case we want to avoid leaving a directory around fully accessible that is owned by
2207 * a dynamic user whose UID is later on reused. To lock this down we use the same
2208 * trick used by container managers to prohibit host users to get access to files of
2209 * the same UID in containers: we place everything inside a directory that has an
2210 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2211 * for unprivileged host code. We then use fs namespacing to make this directory
2212 * permeable for the service itself.
2213 *
2214 * Specifically: for a service which wants a special directory "foo/" we first create
2215 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2216 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2217 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2218 * unprivileged host users can't look into it. Inside of the namespace of the unit
2219 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2220 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2221 * for the service and making sure it only gets access to the dirs it needs but no
2222 * others. Tricky? Yes, absolutely, but it works!
2223 *
2224 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2225 * to be owned by the service itself.
2226 *
2227 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2228 * for sharing files or sockets with other services. */
2229
2230 private_root = path_join(params->prefix[type], "private");
2231 if (!private_root) {
2232 r = -ENOMEM;
2233 goto fail;
2234 }
2235
2236 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2237 r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2238 if (r < 0)
2239 goto fail;
2240
2241 pp = path_join(private_root, *rt);
2242 if (!pp) {
2243 r = -ENOMEM;
2244 goto fail;
2245 }
2246
2247 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2248 r = mkdir_parents_label(pp, 0755);
2249 if (r < 0)
2250 goto fail;
2251
2252 if (is_dir(p, false) > 0 &&
2253 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2254
2255 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2256 * it over. Most likely the service has been upgraded from one that didn't use
2257 * DynamicUser=1, to one that does. */
2258
2259 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2260 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2261 exec_directory_type_to_string(type), p, pp);
2262
2263 if (rename(p, pp) < 0) {
2264 r = -errno;
2265 goto fail;
2266 }
2267 } else {
2268 /* Otherwise, create the actual directory for the service */
2269
2270 r = mkdir_label(pp, context->directories[type].mode);
2271 if (r < 0 && r != -EEXIST)
2272 goto fail;
2273 }
2274
2275 /* And link it up from the original place */
2276 r = symlink_idempotent(pp, p, true);
2277 if (r < 0)
2278 goto fail;
2279
2280 } else {
2281 _cleanup_free_ char *target = NULL;
2282
2283 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2284 readlink_and_make_absolute(p, &target) >= 0) {
2285 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2286
2287 /* This already exists and is a symlink? Interesting. Maybe it's one created
2288 * by DynamicUser=1 (see above)?
2289 *
2290 * We do this for all directory types except for ConfigurationDirectory=,
2291 * since they all support the private/ symlink logic at least in some
2292 * configurations, see above. */
2293
2294 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2295 if (r < 0)
2296 goto fail;
2297
2298 q = path_join(params->prefix[type], "private", *rt);
2299 if (!q) {
2300 r = -ENOMEM;
2301 goto fail;
2302 }
2303
2304 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2305 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2306 if (r < 0)
2307 goto fail;
2308
2309 if (path_equal(q_resolved, target_resolved)) {
2310
2311 /* Hmm, apparently DynamicUser= was once turned on for this service,
2312 * but is no longer. Let's move the directory back up. */
2313
2314 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2315 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2316 exec_directory_type_to_string(type), q, p);
2317
2318 if (unlink(p) < 0) {
2319 r = -errno;
2320 goto fail;
2321 }
2322
2323 if (rename(q, p) < 0) {
2324 r = -errno;
2325 goto fail;
2326 }
2327 }
2328 }
2329
2330 r = mkdir_label(p, context->directories[type].mode);
2331 if (r < 0) {
2332 if (r != -EEXIST)
2333 goto fail;
2334
2335 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2336 struct stat st;
2337
2338 /* Don't change the owner/access mode of the configuration directory,
2339 * as in the common case it is not written to by a service, and shall
2340 * not be writable. */
2341
2342 if (stat(p, &st) < 0) {
2343 r = -errno;
2344 goto fail;
2345 }
2346
2347 /* Still complain if the access mode doesn't match */
2348 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2349 log_warning("%s \'%s\' already exists but the mode is different. "
2350 "(File system: %o %sMode: %o)",
2351 exec_directory_type_to_string(type), *rt,
2352 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2353
2354 continue;
2355 }
2356 }
2357 }
2358
2359 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2360 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2361 * current UID/GID ownership.) */
2362 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2363 if (r < 0)
2364 goto fail;
2365
2366 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2367 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2368 * assignments to exist.*/
2369 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2370 if (r < 0)
2371 goto fail;
2372 }
2373
2374 return 0;
2375
2376 fail:
2377 *exit_status = exit_status_table[type];
2378 return r;
2379 }
2380
#if ENABLE_SMACK
/* Applies a SMACK label via mac_smack_apply_pid() (called with PID argument 0) for the command about
 * to be executed.
 *
 * If the context carries an explicit smack_process_label, that label is applied. Otherwise, and only
 * when SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the SMACK_ATTR_EXEC label of the command's
 * binary is read and applied, falling back to SMACK_DEFAULT_PROCESS_LABEL if the binary carries none.
 *
 * Returns 0 on success, or the negative error returned by the failing helper. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                int k;

                k = mac_smack_apply_pid(0, context->smack_process_label);
                return k < 0 ? k : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *label = NULL;
                int k;

                /* A binary without the attribute (or a file system that cannot store it) is not an
                 * error; we simply fall back to the compiled-in default below. */
                k = mac_smack_read(command->path, SMACK_ATTR_EXEC, &label);
                if (k < 0 && k != -ENODATA && k != -EOPNOTSUPP)
                        return k;

                k = mac_smack_apply_pid(0, label ?: SMACK_DEFAULT_PROCESS_LABEL);
                if (k < 0)
                        return k;
        }
#endif

        return 0;
}
#endif
2413
2414 static int compile_bind_mounts(
2415 const ExecContext *context,
2416 const ExecParameters *params,
2417 BindMount **ret_bind_mounts,
2418 size_t *ret_n_bind_mounts,
2419 char ***ret_empty_directories) {
2420
2421 _cleanup_strv_free_ char **empty_directories = NULL;
2422 BindMount *bind_mounts;
2423 size_t n, h = 0, i;
2424 ExecDirectoryType t;
2425 int r;
2426
2427 assert(context);
2428 assert(params);
2429 assert(ret_bind_mounts);
2430 assert(ret_n_bind_mounts);
2431 assert(ret_empty_directories);
2432
2433 n = context->n_bind_mounts;
2434 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2435 if (!params->prefix[t])
2436 continue;
2437
2438 n += strv_length(context->directories[t].paths);
2439 }
2440
2441 if (n <= 0) {
2442 *ret_bind_mounts = NULL;
2443 *ret_n_bind_mounts = 0;
2444 *ret_empty_directories = NULL;
2445 return 0;
2446 }
2447
2448 bind_mounts = new(BindMount, n);
2449 if (!bind_mounts)
2450 return -ENOMEM;
2451
2452 for (i = 0; i < context->n_bind_mounts; i++) {
2453 BindMount *item = context->bind_mounts + i;
2454 char *s, *d;
2455
2456 s = strdup(item->source);
2457 if (!s) {
2458 r = -ENOMEM;
2459 goto finish;
2460 }
2461
2462 d = strdup(item->destination);
2463 if (!d) {
2464 free(s);
2465 r = -ENOMEM;
2466 goto finish;
2467 }
2468
2469 bind_mounts[h++] = (BindMount) {
2470 .source = s,
2471 .destination = d,
2472 .read_only = item->read_only,
2473 .recursive = item->recursive,
2474 .ignore_enoent = item->ignore_enoent,
2475 };
2476 }
2477
2478 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2479 char **suffix;
2480
2481 if (!params->prefix[t])
2482 continue;
2483
2484 if (strv_isempty(context->directories[t].paths))
2485 continue;
2486
2487 if (exec_directory_is_private(context, t) &&
2488 !(context->root_directory || context->root_image)) {
2489 char *private_root;
2490
2491 /* So this is for a dynamic user, and we need to make sure the process can access its own
2492 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2493 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2494
2495 private_root = path_join(params->prefix[t], "private");
2496 if (!private_root) {
2497 r = -ENOMEM;
2498 goto finish;
2499 }
2500
2501 r = strv_consume(&empty_directories, private_root);
2502 if (r < 0)
2503 goto finish;
2504 }
2505
2506 STRV_FOREACH(suffix, context->directories[t].paths) {
2507 char *s, *d;
2508
2509 if (exec_directory_is_private(context, t))
2510 s = path_join(params->prefix[t], "private", *suffix);
2511 else
2512 s = path_join(params->prefix[t], *suffix);
2513 if (!s) {
2514 r = -ENOMEM;
2515 goto finish;
2516 }
2517
2518 if (exec_directory_is_private(context, t) &&
2519 (context->root_directory || context->root_image))
2520 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2521 * directory is not created on the root directory. So, let's bind-mount the directory
2522 * on the 'non-private' place. */
2523 d = path_join(params->prefix[t], *suffix);
2524 else
2525 d = strdup(s);
2526 if (!d) {
2527 free(s);
2528 r = -ENOMEM;
2529 goto finish;
2530 }
2531
2532 bind_mounts[h++] = (BindMount) {
2533 .source = s,
2534 .destination = d,
2535 .read_only = false,
2536 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2537 .recursive = true,
2538 .ignore_enoent = false,
2539 };
2540 }
2541 }
2542
2543 assert(h == n);
2544
2545 *ret_bind_mounts = bind_mounts;
2546 *ret_n_bind_mounts = n;
2547 *ret_empty_directories = TAKE_PTR(empty_directories);
2548
2549 return (int) n;
2550
2551 finish:
2552 bind_mount_free_many(bind_mounts, h);
2553 return r;
2554 }
2555
2556 static bool insist_on_sandboxing(
2557 const ExecContext *context,
2558 const char *root_dir,
2559 const char *root_image,
2560 const BindMount *bind_mounts,
2561 size_t n_bind_mounts) {
2562
2563 size_t i;
2564
2565 assert(context);
2566 assert(n_bind_mounts == 0 || bind_mounts);
2567
2568 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2569 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2570 * rearrange stuff in a way we cannot ignore gracefully. */
2571
2572 if (context->n_temporary_filesystems > 0)
2573 return true;
2574
2575 if (root_dir || root_image)
2576 return true;
2577
2578 if (context->dynamic_user)
2579 return true;
2580
2581 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2582 * essential. */
2583 for (i = 0; i < n_bind_mounts; i++)
2584 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2585 return true;
2586
2587 if (context->log_namespace)
2588 return true;
2589
2590 return false;
2591 }
2592
2593 static int apply_mount_namespace(
2594 const Unit *u,
2595 const ExecCommand *command,
2596 const ExecContext *context,
2597 const ExecParameters *params,
2598 const ExecRuntime *runtime,
2599 char **error_path) {
2600
2601 _cleanup_strv_free_ char **empty_directories = NULL;
2602 char *tmp = NULL, *var = NULL;
2603 const char *root_dir = NULL, *root_image = NULL;
2604 NamespaceInfo ns_info;
2605 bool needs_sandboxing;
2606 BindMount *bind_mounts = NULL;
2607 size_t n_bind_mounts = 0;
2608 int r;
2609
2610 assert(context);
2611
2612 if (params->flags & EXEC_APPLY_CHROOT) {
2613 root_image = context->root_image;
2614
2615 if (!root_image)
2616 root_dir = context->root_directory;
2617 }
2618
2619 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2620 if (r < 0)
2621 return r;
2622
2623 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2624 if (needs_sandboxing) {
2625 /* The runtime struct only contains the parent of the private /tmp,
2626 * which is non-accessible to world users. Inside of it there's a /tmp
2627 * that is sticky, and that's the one we want to use here. */
2628
2629 if (context->private_tmp && runtime) {
2630 if (runtime->tmp_dir)
2631 tmp = strjoina(runtime->tmp_dir, "/tmp");
2632 if (runtime->var_tmp_dir)
2633 var = strjoina(runtime->var_tmp_dir, "/tmp");
2634 }
2635
2636 ns_info = (NamespaceInfo) {
2637 .ignore_protect_paths = false,
2638 .private_dev = context->private_devices,
2639 .protect_control_groups = context->protect_control_groups,
2640 .protect_kernel_tunables = context->protect_kernel_tunables,
2641 .protect_kernel_modules = context->protect_kernel_modules,
2642 .protect_kernel_logs = context->protect_kernel_logs,
2643 .protect_hostname = context->protect_hostname,
2644 .mount_apivfs = context->mount_apivfs,
2645 .private_mounts = context->private_mounts,
2646 };
2647 } else if (!context->dynamic_user && root_dir)
2648 /*
2649 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2650 * sandbox info, otherwise enforce it, don't ignore protected paths and
2651 * fail if we are enable to apply the sandbox inside the mount namespace.
2652 */
2653 ns_info = (NamespaceInfo) {
2654 .ignore_protect_paths = true,
2655 };
2656 else
2657 ns_info = (NamespaceInfo) {};
2658
2659 if (context->mount_flags == MS_SHARED)
2660 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2661
2662 r = setup_namespace(root_dir, root_image,
2663 &ns_info, context->read_write_paths,
2664 needs_sandboxing ? context->read_only_paths : NULL,
2665 needs_sandboxing ? context->inaccessible_paths : NULL,
2666 empty_directories,
2667 bind_mounts,
2668 n_bind_mounts,
2669 context->temporary_filesystems,
2670 context->n_temporary_filesystems,
2671 tmp,
2672 var,
2673 context->log_namespace,
2674 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2675 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2676 context->mount_flags,
2677 DISSECT_IMAGE_DISCARD_ON_LOOP|DISSECT_IMAGE_RELAX_VAR_CHECK|DISSECT_IMAGE_FSCK,
2678 error_path);
2679
2680 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2681 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2682 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2683 * completely different execution environment. */
2684 if (r == -ENOANO) {
2685 if (insist_on_sandboxing(
2686 context,
2687 root_dir, root_image,
2688 bind_mounts,
2689 n_bind_mounts)) {
2690 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2691 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2692 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2693
2694 r = -EOPNOTSUPP;
2695 } else {
2696 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2697 r = 0;
2698 }
2699 }
2700
2701 bind_mount_free_many(bind_mounts, n_bind_mounts);
2702 return r;
2703 }
2704
2705 static int apply_working_directory(
2706 const ExecContext *context,
2707 const ExecParameters *params,
2708 const char *home,
2709 int *exit_status) {
2710
2711 const char *d, *wd;
2712
2713 assert(context);
2714 assert(exit_status);
2715
2716 if (context->working_directory_home) {
2717
2718 if (!home) {
2719 *exit_status = EXIT_CHDIR;
2720 return -ENXIO;
2721 }
2722
2723 wd = home;
2724
2725 } else if (context->working_directory)
2726 wd = context->working_directory;
2727 else
2728 wd = "/";
2729
2730 if (params->flags & EXEC_APPLY_CHROOT)
2731 d = wd;
2732 else
2733 d = prefix_roota(context->root_directory, wd);
2734
2735 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2736 *exit_status = EXIT_CHDIR;
2737 return -errno;
2738 }
2739
2740 return 0;
2741 }
2742
2743 static int apply_root_directory(
2744 const ExecContext *context,
2745 const ExecParameters *params,
2746 const bool needs_mount_ns,
2747 int *exit_status) {
2748
2749 assert(context);
2750 assert(exit_status);
2751
2752 if (params->flags & EXEC_APPLY_CHROOT) {
2753 if (!needs_mount_ns && context->root_directory)
2754 if (chroot(context->root_directory) < 0) {
2755 *exit_status = EXIT_CHROOT;
2756 return -errno;
2757 }
2758 }
2759
2760 return 0;
2761 }
2762
2763 static int setup_keyring(
2764 const Unit *u,
2765 const ExecContext *context,
2766 const ExecParameters *p,
2767 uid_t uid, gid_t gid) {
2768
2769 key_serial_t keyring;
2770 int r = 0;
2771 uid_t saved_uid;
2772 gid_t saved_gid;
2773
2774 assert(u);
2775 assert(context);
2776 assert(p);
2777
2778 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2779 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2780 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2781 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2782 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2783 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2784
2785 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2786 return 0;
2787
2788 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2789 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2790 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2791 * & group is just as nasty as acquiring a reference to the user keyring. */
2792
2793 saved_uid = getuid();
2794 saved_gid = getgid();
2795
2796 if (gid_is_valid(gid) && gid != saved_gid) {
2797 if (setregid(gid, -1) < 0)
2798 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2799 }
2800
2801 if (uid_is_valid(uid) && uid != saved_uid) {
2802 if (setreuid(uid, -1) < 0) {
2803 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2804 goto out;
2805 }
2806 }
2807
2808 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2809 if (keyring == -1) {
2810 if (errno == ENOSYS)
2811 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2812 else if (IN_SET(errno, EACCES, EPERM))
2813 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2814 else if (errno == EDQUOT)
2815 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2816 else
2817 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2818
2819 goto out;
2820 }
2821
2822 /* When requested link the user keyring into the session keyring. */
2823 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2824
2825 if (keyctl(KEYCTL_LINK,
2826 KEY_SPEC_USER_KEYRING,
2827 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2828 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2829 goto out;
2830 }
2831 }
2832
2833 /* Restore uid/gid back */
2834 if (uid_is_valid(uid) && uid != saved_uid) {
2835 if (setreuid(saved_uid, -1) < 0) {
2836 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2837 goto out;
2838 }
2839 }
2840
2841 if (gid_is_valid(gid) && gid != saved_gid) {
2842 if (setregid(saved_gid, -1) < 0)
2843 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2844 }
2845
2846 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2847 if (!sd_id128_is_null(u->invocation_id)) {
2848 key_serial_t key;
2849
2850 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2851 if (key == -1)
2852 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2853 else {
2854 if (keyctl(KEYCTL_SETPERM, key,
2855 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2856 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2857 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2858 }
2859 }
2860
2861 out:
2862 /* Revert back uid & gid for the the last time, and exit */
2863 /* no extra logging, as only the first already reported error matters */
2864 if (getuid() != saved_uid)
2865 (void) setreuid(saved_uid, -1);
2866
2867 if (getgid() != saved_gid)
2868 (void) setregid(saved_gid, -1);
2869
2870 return r;
2871 }
2872
/* Appends each non-negative file descriptor of 'pair' to 'array', advancing the element counter *n for
 * every descriptor stored. The caller must make sure 'array' has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        size_t k;

        assert(array);
        assert(n);
        assert(pair);

        for (k = 0; k < 2; k++)
                if (pair[k] >= 0)
                        array[(*n)++] = pair[k];
}
2883
2884 static int close_remaining_fds(
2885 const ExecParameters *params,
2886 const ExecRuntime *runtime,
2887 const DynamicCreds *dcreds,
2888 int user_lookup_fd,
2889 int socket_fd,
2890 int exec_fd,
2891 const int *fds, size_t n_fds) {
2892
2893 size_t n_dont_close = 0;
2894 int dont_close[n_fds + 12];
2895
2896 assert(params);
2897
2898 if (params->stdin_fd >= 0)
2899 dont_close[n_dont_close++] = params->stdin_fd;
2900 if (params->stdout_fd >= 0)
2901 dont_close[n_dont_close++] = params->stdout_fd;
2902 if (params->stderr_fd >= 0)
2903 dont_close[n_dont_close++] = params->stderr_fd;
2904
2905 if (socket_fd >= 0)
2906 dont_close[n_dont_close++] = socket_fd;
2907 if (exec_fd >= 0)
2908 dont_close[n_dont_close++] = exec_fd;
2909 if (n_fds > 0) {
2910 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2911 n_dont_close += n_fds;
2912 }
2913
2914 if (runtime)
2915 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2916
2917 if (dcreds) {
2918 if (dcreds->user)
2919 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2920 if (dcreds->group)
2921 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2922 }
2923
2924 if (user_lookup_fd >= 0)
2925 dont_close[n_dont_close++] = user_lookup_fd;
2926
2927 return close_all_fds(dont_close, n_dont_close);
2928 }
2929
2930 static int send_user_lookup(
2931 Unit *unit,
2932 int user_lookup_fd,
2933 uid_t uid,
2934 gid_t gid) {
2935
2936 assert(unit);
2937
2938 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2939 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2940 * specified. */
2941
2942 if (user_lookup_fd < 0)
2943 return 0;
2944
2945 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2946 return 0;
2947
2948 if (writev(user_lookup_fd,
2949 (struct iovec[]) {
2950 IOVEC_INIT(&uid, sizeof(uid)),
2951 IOVEC_INIT(&gid, sizeof(gid)),
2952 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2953 return -errno;
2954
2955 return 0;
2956 }
2957
2958 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2959 int r;
2960
2961 assert(c);
2962 assert(home);
2963 assert(buf);
2964
2965 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2966
2967 if (*home)
2968 return 0;
2969
2970 if (!c->working_directory_home)
2971 return 0;
2972
2973 r = get_home_dir(buf);
2974 if (r < 0)
2975 return r;
2976
2977 *home = *buf;
2978 return 1;
2979 }
2980
2981 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2982 _cleanup_strv_free_ char ** list = NULL;
2983 ExecDirectoryType t;
2984 int r;
2985
2986 assert(c);
2987 assert(p);
2988 assert(ret);
2989
2990 assert(c->dynamic_user);
2991
2992 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2993 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2994 * directories. */
2995
2996 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2997 char **i;
2998
2999 if (t == EXEC_DIRECTORY_CONFIGURATION)
3000 continue;
3001
3002 if (!p->prefix[t])
3003 continue;
3004
3005 STRV_FOREACH(i, c->directories[t].paths) {
3006 char *e;
3007
3008 if (exec_directory_is_private(c, t))
3009 e = path_join(p->prefix[t], "private", *i);
3010 else
3011 e = path_join(p->prefix[t], *i);
3012 if (!e)
3013 return -ENOMEM;
3014
3015 r = strv_consume(&list, e);
3016 if (r < 0)
3017 return r;
3018 }
3019 }
3020
3021 *ret = TAKE_PTR(list);
3022
3023 return 0;
3024 }
3025
3026 static char *exec_command_line(char **argv);
3027
3028 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3029 bool using_subcgroup;
3030 char *p;
3031
3032 assert(params);
3033 assert(ret);
3034
3035 if (!params->cgroup_path)
3036 return -EINVAL;
3037
3038 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3039 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3040 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3041 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3042 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3043 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3044 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3045 * flag, which is only passed for the former statements, not for the latter. */
3046
3047 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3048 if (using_subcgroup)
3049 p = path_join(params->cgroup_path, ".control");
3050 else
3051 p = strdup(params->cgroup_path);
3052 if (!p)
3053 return -ENOMEM;
3054
3055 *ret = p;
3056 return using_subcgroup;
3057 }
3058
3059 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3060 _cleanup_(cpu_set_reset) CPUSet s = {};
3061 int r;
3062
3063 assert(c);
3064 assert(ret);
3065
3066 if (!c->numa_policy.nodes.set) {
3067 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3068 return 0;
3069 }
3070
3071 r = numa_to_cpu_set(&c->numa_policy, &s);
3072 if (r < 0)
3073 return r;
3074
3075 cpu_set_reset(ret);
3076
3077 return cpu_set_add_all(ret, &s);
3078 }
3079
3080 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3081 assert(c);
3082
3083 return c->cpu_affinity_from_numa;
3084 }
3085
3086 static int exec_child(
3087 Unit *unit,
3088 const ExecCommand *command,
3089 const ExecContext *context,
3090 const ExecParameters *params,
3091 ExecRuntime *runtime,
3092 DynamicCreds *dcreds,
3093 int socket_fd,
3094 const int named_iofds[static 3],
3095 int *fds,
3096 size_t n_socket_fds,
3097 size_t n_storage_fds,
3098 char **files_env,
3099 int user_lookup_fd,
3100 int *exit_status) {
3101
3102 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
3103 int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
3104 _cleanup_free_ gid_t *supplementary_gids = NULL;
3105 const char *username = NULL, *groupname = NULL;
3106 _cleanup_free_ char *home_buffer = NULL;
3107 const char *home = NULL, *shell = NULL;
3108 char **final_argv = NULL;
3109 dev_t journal_stream_dev = 0;
3110 ino_t journal_stream_ino = 0;
3111 bool userns_set_up = false;
3112 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3113 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3114 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3115 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
3116 #if HAVE_SELINUX
3117 _cleanup_free_ char *mac_selinux_context_net = NULL;
3118 bool use_selinux = false;
3119 #endif
3120 #if ENABLE_SMACK
3121 bool use_smack = false;
3122 #endif
3123 #if HAVE_APPARMOR
3124 bool use_apparmor = false;
3125 #endif
3126 uid_t saved_uid = getuid();
3127 gid_t saved_gid = getgid();
3128 uid_t uid = UID_INVALID;
3129 gid_t gid = GID_INVALID;
3130 size_t n_fds;
3131 ExecDirectoryType dt;
3132 int secure_bits;
3133 _cleanup_free_ gid_t *gids_after_pam = NULL;
3134 int ngids_after_pam = 0;
3135
3136 assert(unit);
3137 assert(command);
3138 assert(context);
3139 assert(params);
3140 assert(exit_status);
3141
3142 rename_process_from_path(command->path);
3143
3144 /* We reset exactly these signals, since they are the
3145 * only ones we set to SIG_IGN in the main daemon. All
3146 * others we leave untouched because we set them to
3147 * SIG_DFL or a valid handler initially, both of which
3148 * will be demoted to SIG_DFL. */
3149 (void) default_signals(SIGNALS_CRASH_HANDLER,
3150 SIGNALS_IGNORE, -1);
3151
3152 if (context->ignore_sigpipe)
3153 (void) ignore_signals(SIGPIPE, -1);
3154
3155 r = reset_signal_mask();
3156 if (r < 0) {
3157 *exit_status = EXIT_SIGNAL_MASK;
3158 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
3159 }
3160
3161 if (params->idle_pipe)
3162 do_idle_pipe_dance(params->idle_pipe);
3163
3164 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3165 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3166 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3167 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
3168
3169 log_forget_fds();
3170 log_set_open_when_needed(true);
3171
3172 /* In case anything used libc syslog(), close this here, too */
3173 closelog();
3174
3175 n_fds = n_socket_fds + n_storage_fds;
3176 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
3177 if (r < 0) {
3178 *exit_status = EXIT_FDS;
3179 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3180 }
3181
3182 if (!context->same_pgrp)
3183 if (setsid() < 0) {
3184 *exit_status = EXIT_SETSID;
3185 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3186 }
3187
3188 exec_context_tty_reset(context, params);
3189
3190 if (unit_shall_confirm_spawn(unit)) {
3191 const char *vc = params->confirm_spawn;
3192 _cleanup_free_ char *cmdline = NULL;
3193
3194 cmdline = exec_command_line(command->argv);
3195 if (!cmdline) {
3196 *exit_status = EXIT_MEMORY;
3197 return log_oom();
3198 }
3199
3200 r = ask_for_confirmation(vc, unit, cmdline);
3201 if (r != CONFIRM_EXECUTE) {
3202 if (r == CONFIRM_PRETEND_SUCCESS) {
3203 *exit_status = EXIT_SUCCESS;
3204 return 0;
3205 }
3206 *exit_status = EXIT_CONFIRM;
3207 log_unit_error(unit, "Execution cancelled by the user");
3208 return -ECANCELED;
3209 }
3210 }
3211
3212 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3213 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3214 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3215 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3216 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3217 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3218 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3219 *exit_status = EXIT_MEMORY;
3220 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3221 }
3222
3223 if (context->dynamic_user && dcreds) {
3224 _cleanup_strv_free_ char **suggested_paths = NULL;
3225
3226 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3227 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3228 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3229 *exit_status = EXIT_USER;
3230 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3231 }
3232
3233 r = compile_suggested_paths(context, params, &suggested_paths);
3234 if (r < 0) {
3235 *exit_status = EXIT_MEMORY;
3236 return log_oom();
3237 }
3238
3239 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3240 if (r < 0) {
3241 *exit_status = EXIT_USER;
3242 if (r == -EILSEQ) {
3243 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3244 return -EOPNOTSUPP;
3245 }
3246 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3247 }
3248
3249 if (!uid_is_valid(uid)) {
3250 *exit_status = EXIT_USER;
3251 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
3252 return -ESRCH;
3253 }
3254
3255 if (!gid_is_valid(gid)) {
3256 *exit_status = EXIT_USER;
3257 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
3258 return -ESRCH;
3259 }
3260
3261 if (dcreds->user)
3262 username = dcreds->user->name;
3263
3264 } else {
3265 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3266 if (r < 0) {
3267 *exit_status = EXIT_USER;
3268 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3269 }
3270
3271 r = get_fixed_group(context, &groupname, &gid);
3272 if (r < 0) {
3273 *exit_status = EXIT_GROUP;
3274 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3275 }
3276 }
3277
3278 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3279 r = get_supplementary_groups(context, username, groupname, gid,
3280 &supplementary_gids, &ngids);
3281 if (r < 0) {
3282 *exit_status = EXIT_GROUP;
3283 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3284 }
3285
3286 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3287 if (r < 0) {
3288 *exit_status = EXIT_USER;
3289 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3290 }
3291
3292 user_lookup_fd = safe_close(user_lookup_fd);
3293
3294 r = acquire_home(context, uid, &home, &home_buffer);
3295 if (r < 0) {
3296 *exit_status = EXIT_CHDIR;
3297 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3298 }
3299
3300 /* If a socket is connected to STDIN/STDOUT/STDERR, we
3301 * must be sure to drop O_NONBLOCK */
3302 if (socket_fd >= 0)
3303 (void) fd_nonblock(socket_fd, false);
3304
3305 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3306 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3307 if (params->cgroup_path) {
3308 _cleanup_free_ char *p = NULL;
3309
3310 r = exec_parameters_get_cgroup_path(params, &p);
3311 if (r < 0) {
3312 *exit_status = EXIT_CGROUP;
3313 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3314 }
3315
3316 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3317 if (r < 0) {
3318 *exit_status = EXIT_CGROUP;
3319 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3320 }
3321 }
3322
3323 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3324 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3325 if (r < 0) {
3326 *exit_status = EXIT_NETWORK;
3327 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3328 }
3329 }
3330
3331 r = setup_input(context, params, socket_fd, named_iofds);
3332 if (r < 0) {
3333 *exit_status = EXIT_STDIN;
3334 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3335 }
3336
3337 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3338 if (r < 0) {
3339 *exit_status = EXIT_STDOUT;
3340 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3341 }
3342
3343 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3344 if (r < 0) {
3345 *exit_status = EXIT_STDERR;
3346 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3347 }
3348
3349 if (context->oom_score_adjust_set) {
3350 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3351 * prohibit write access to this file, and we shouldn't trip up over that. */
3352 r = set_oom_score_adjust(context->oom_score_adjust);
3353 if (IN_SET(r, -EPERM, -EACCES))
3354 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3355 else if (r < 0) {
3356 *exit_status = EXIT_OOM_ADJUST;
3357 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3358 }
3359 }
3360
3361 if (context->coredump_filter_set) {
3362 r = set_coredump_filter(context->coredump_filter);
3363 if (ERRNO_IS_PRIVILEGE(r))
3364 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
3365 else if (r < 0)
3366 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
3367 }
3368
3369 if (context->nice_set) {
3370 r = setpriority_closest(context->nice);
3371 if (r < 0)
3372 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
3373 }
3374
3375 if (context->cpu_sched_set) {
3376 struct sched_param param = {
3377 .sched_priority = context->cpu_sched_priority,
3378 };
3379
3380 r = sched_setscheduler(0,
3381 context->cpu_sched_policy |
3382 (context->cpu_sched_reset_on_fork ?
3383 SCHED_RESET_ON_FORK : 0),
3384 &param);
3385 if (r < 0) {
3386 *exit_status = EXIT_SETSCHEDULER;
3387 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3388 }
3389 }
3390
3391 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
3392 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
3393 const CPUSet *cpu_set;
3394
3395 if (context->cpu_affinity_from_numa) {
3396 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
3397 if (r < 0) {
3398 *exit_status = EXIT_CPUAFFINITY;
3399 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
3400 }
3401
3402 cpu_set = &converted_cpu_set;
3403 } else
3404 cpu_set = &context->cpu_set;
3405
3406 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
3407 *exit_status = EXIT_CPUAFFINITY;
3408 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3409 }
3410 }
3411
3412 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3413 r = apply_numa_policy(&context->numa_policy);
3414 if (r == -EOPNOTSUPP)
3415 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
3416 else if (r < 0) {
3417 *exit_status = EXIT_NUMA_POLICY;
3418 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
3419 }
3420 }
3421
3422 if (context->ioprio_set)
3423 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3424 *exit_status = EXIT_IOPRIO;
3425 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3426 }
3427
3428 if (context->timer_slack_nsec != NSEC_INFINITY)
3429 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3430 *exit_status = EXIT_TIMERSLACK;
3431 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3432 }
3433
3434 if (context->personality != PERSONALITY_INVALID) {
3435 r = safe_personality(context->personality);
3436 if (r < 0) {
3437 *exit_status = EXIT_PERSONALITY;
3438 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3439 }
3440 }
3441
3442 if (context->utmp_id)
3443 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3444 context->tty_path,
3445 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
3446 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3447 USER_PROCESS,
3448 username);
3449
3450 if (uid_is_valid(uid)) {
3451 r = chown_terminal(STDIN_FILENO, uid);
3452 if (r < 0) {
3453 *exit_status = EXIT_STDIN;
3454 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3455 }
3456 }
3457
3458 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3459 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3460 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3461 * touch a single hierarchy too. */
3462 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3463 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3464 if (r < 0) {
3465 *exit_status = EXIT_CGROUP;
3466 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3467 }
3468 }
3469
3470 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3471 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3472 if (r < 0)
3473 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3474 }
3475
3476 r = build_environment(
3477 unit,
3478 context,
3479 params,
3480 n_fds,
3481 home,
3482 username,
3483 shell,
3484 journal_stream_dev,
3485 journal_stream_ino,
3486 &our_env);
3487 if (r < 0) {
3488 *exit_status = EXIT_MEMORY;
3489 return log_oom();
3490 }
3491
3492 r = build_pass_environment(context, &pass_env);
3493 if (r < 0) {
3494 *exit_status = EXIT_MEMORY;
3495 return log_oom();
3496 }
3497
3498 accum_env = strv_env_merge(5,
3499 params->environment,
3500 our_env,
3501 pass_env,
3502 context->environment,
3503 files_env);
3504 if (!accum_env) {
3505 *exit_status = EXIT_MEMORY;
3506 return log_oom();
3507 }
3508 accum_env = strv_env_clean(accum_env);
3509
3510 (void) umask(context->umask);
3511
3512 r = setup_keyring(unit, context, params, uid, gid);
3513 if (r < 0) {
3514 *exit_status = EXIT_KEYRING;
3515 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3516 }
3517
3518 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3519 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3520
3521 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3522 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3523
3524 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3525 if (needs_ambient_hack)
3526 needs_setuid = false;
3527 else
3528 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3529
3530 if (needs_sandboxing) {
3531 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3532 * present. The actual MAC context application will happen later, as late as possible, to avoid
3533 * impacting our own code paths. */
3534
3535 #if HAVE_SELINUX
3536 use_selinux = mac_selinux_use();
3537 #endif
3538 #if ENABLE_SMACK
3539 use_smack = mac_smack_use();
3540 #endif
3541 #if HAVE_APPARMOR
3542 use_apparmor = mac_apparmor_use();
3543 #endif
3544 }
3545
3546 if (needs_sandboxing) {
3547 int which_failed;
3548
3549 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3550 * is set here. (See below.) */
3551
3552 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3553 if (r < 0) {
3554 *exit_status = EXIT_LIMITS;
3555 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3556 }
3557 }
3558
3559 if (needs_setuid) {
3560
3561 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3562 * wins here. (See above.) */
3563
3564 if (context->pam_name && username) {
3565 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3566 if (r < 0) {
3567 *exit_status = EXIT_PAM;
3568 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3569 }
3570
3571 ngids_after_pam = getgroups_alloc(&gids_after_pam);
3572 if (ngids_after_pam < 0) {
3573 *exit_status = EXIT_MEMORY;
3574 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
3575 }
3576 }
3577 }
3578
3579 if (needs_sandboxing) {
3580 #if HAVE_SELINUX
3581 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3582 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3583 if (r < 0) {
3584 *exit_status = EXIT_SELINUX_CONTEXT;
3585 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3586 }
3587 }
3588 #endif
3589
3590 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
3591 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
3592 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
3593 if (context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
3594 userns_set_up = true;
3595 r = setup_private_users(saved_uid, saved_gid, uid, gid);
3596 if (r < 0) {
3597 *exit_status = EXIT_USER;
3598 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
3599 }
3600 }
3601 }
3602
3603 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3604
3605 if (ns_type_supported(NAMESPACE_NET)) {
3606 r = setup_netns(runtime->netns_storage_socket);
3607 if (r == -EPERM)
3608 log_unit_warning_errno(unit, r,
3609 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
3610 else if (r < 0) {
3611 *exit_status = EXIT_NETWORK;
3612 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3613 }
3614 } else if (context->network_namespace_path) {
3615 *exit_status = EXIT_NETWORK;
3616 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
3617 "NetworkNamespacePath= is not supported, refusing.");
3618 } else
3619 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3620 }
3621
3622 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3623 if (needs_mount_namespace) {
3624 _cleanup_free_ char *error_path = NULL;
3625
3626 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3627 if (r < 0) {
3628 *exit_status = EXIT_NAMESPACE;
3629 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3630 error_path ? ": " : "", strempty(error_path));
3631 }
3632 }
3633
3634 if (needs_sandboxing) {
3635 r = apply_protect_hostname(unit, context, exit_status);
3636 if (r < 0)
3637 return r;
3638 }
3639
3640 /* Drop groups as early as possible.
3641 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
3642 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
3643 if (needs_setuid) {
3644 _cleanup_free_ gid_t *gids_to_enforce = NULL;
3645 int ngids_to_enforce = 0;
3646
3647 ngids_to_enforce = merge_gid_lists(supplementary_gids,
3648 ngids,
3649 gids_after_pam,
3650 ngids_after_pam,
3651 &gids_to_enforce);
3652 if (ngids_to_enforce < 0) {
3653 *exit_status = EXIT_MEMORY;
3654 return log_unit_error_errno(unit,
3655 ngids_to_enforce,
3656 "Failed to merge group lists. Group membership might be incorrect: %m");
3657 }
3658
3659 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
3660 if (r < 0) {
3661 *exit_status = EXIT_GROUP;
3662 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3663 }
3664 }
3665
3666 /* If the user namespace was not set up above, try to do it now.
3667 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
3668 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
3669 * case of mount namespaces being less privileged when the mount point list is copied from a
3670 * different user namespace). */
3671
3672 if (needs_sandboxing && context->private_users && !userns_set_up) {
3673 r = setup_private_users(saved_uid, saved_gid, uid, gid);
3674 if (r < 0) {
3675 *exit_status = EXIT_USER;
3676 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3677 }
3678 }
3679
3680 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3681 * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3682 * however if we have it as we want to keep it open until the final execve(). */
3683
3684 if (params->exec_fd >= 0) {
3685 exec_fd = params->exec_fd;
3686
3687 if (exec_fd < 3 + (int) n_fds) {
3688 int moved_fd;
3689
3690 /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3691 * process we are about to execute. */
3692
3693 moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3694 if (moved_fd < 0) {
3695 *exit_status = EXIT_FDS;
3696 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3697 }
3698
3699 safe_close(exec_fd);
3700 exec_fd = moved_fd;
3701 } else {
3702 /* This fd should be FD_CLOEXEC already, but let's make sure. */
3703 r = fd_cloexec(exec_fd, true);
3704 if (r < 0) {
3705 *exit_status = EXIT_FDS;
3706 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3707 }
3708 }
3709
3710 fds_with_exec_fd = newa(int, n_fds + 1);
3711 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3712 fds_with_exec_fd[n_fds] = exec_fd;
3713 n_fds_with_exec_fd = n_fds + 1;
3714 } else {
3715 fds_with_exec_fd = fds;
3716 n_fds_with_exec_fd = n_fds;
3717 }
3718
3719 r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3720 if (r >= 0)
3721 r = shift_fds(fds, n_fds);
3722 if (r >= 0)
3723 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3724 if (r < 0) {
3725 *exit_status = EXIT_FDS;
3726 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3727 }
3728
3729 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3730 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3731 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3732 * came this far. */
3733
3734 secure_bits = context->secure_bits;
3735
3736 if (needs_sandboxing) {
3737 uint64_t bset;
3738
3739 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3740 * requested. (Note this is placed after the general resource limit initialization, see
3741 * above, in order to take precedence.) */
3742 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3743 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3744 *exit_status = EXIT_LIMITS;
3745 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3746 }
3747 }
3748
3749 #if ENABLE_SMACK
3750 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3751 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3752 if (use_smack) {
3753 r = setup_smack(context, command);
3754 if (r < 0) {
3755 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3756 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3757 }
3758 }
3759 #endif
3760
3761 bset = context->capability_bounding_set;
3762 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3763 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3764 * instead of us doing that */
3765 if (needs_ambient_hack)
3766 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3767 (UINT64_C(1) << CAP_SETUID) |
3768 (UINT64_C(1) << CAP_SETGID);
3769
3770 if (!cap_test_all(bset)) {
3771 r = capability_bounding_set_drop(bset, false);
3772 if (r < 0) {
3773 *exit_status = EXIT_CAPABILITIES;
3774 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3775 }
3776 }
3777
3778 /* This is done before enforce_user, but ambient set
3779 * does not survive over setresuid() if keep_caps is not set. */
3780 if (!needs_ambient_hack) {
3781 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3782 if (r < 0) {
3783 *exit_status = EXIT_CAPABILITIES;
3784 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3785 }
3786 }
3787 }
3788
3789 /* chroot to root directory first, before we lose the ability to chroot */
3790 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
3791 if (r < 0)
3792 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
3793
3794 if (needs_setuid) {
3795 if (uid_is_valid(uid)) {
3796 r = enforce_user(context, uid);
3797 if (r < 0) {
3798 *exit_status = EXIT_USER;
3799 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3800 }
3801
3802 if (!needs_ambient_hack &&
3803 context->capability_ambient_set != 0) {
3804
3805 /* Fix the ambient capabilities after user change. */
3806 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3807 if (r < 0) {
3808 *exit_status = EXIT_CAPABILITIES;
3809 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3810 }
3811
3812 /* If we were asked to change user and ambient capabilities
3813 * were requested, we had to add keep-caps to the securebits
3814 * so that we would maintain the inherited capability set
3815 * through the setresuid(). Make sure that the bit is added
3816 * also to the context secure_bits so that we don't try to
3817 * drop the bit away next. */
3818
3819 secure_bits |= 1<<SECURE_KEEP_CAPS;
3820 }
3821 }
3822 }
3823
3824 /* Apply working directory here, because the working directory might be on NFS and only the user running
3825 * this service might have the correct privilege to change to the working directory */
3826 r = apply_working_directory(context, params, home, exit_status);
3827 if (r < 0)
3828 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3829
3830 if (needs_sandboxing) {
3831 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3832 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3833 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3834 * are restricted. */
3835
3836 #if HAVE_SELINUX
3837 if (use_selinux) {
3838 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3839
3840 if (exec_context) {
3841 r = setexeccon(exec_context);
3842 if (r < 0) {
3843 *exit_status = EXIT_SELINUX_CONTEXT;
3844 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3845 }
3846 }
3847 }
3848 #endif
3849
3850 #if HAVE_APPARMOR
3851 if (use_apparmor && context->apparmor_profile) {
3852 r = aa_change_onexec(context->apparmor_profile);
3853 if (r < 0 && !context->apparmor_profile_ignore) {
3854 *exit_status = EXIT_APPARMOR_PROFILE;
3855 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3856 }
3857 }
3858 #endif
3859
3860 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3861 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3862 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3863 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3864 *exit_status = EXIT_SECUREBITS;
3865 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3866 }
3867
3868 if (context_has_no_new_privileges(context))
3869 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3870 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3871 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3872 }
3873
3874 #if HAVE_SECCOMP
3875 r = apply_address_families(unit, context);
3876 if (r < 0) {
3877 *exit_status = EXIT_ADDRESS_FAMILIES;
3878 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3879 }
3880
3881 r = apply_memory_deny_write_execute(unit, context);
3882 if (r < 0) {
3883 *exit_status = EXIT_SECCOMP;
3884 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3885 }
3886
3887 r = apply_restrict_realtime(unit, context);
3888 if (r < 0) {
3889 *exit_status = EXIT_SECCOMP;
3890 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3891 }
3892
3893 r = apply_restrict_suid_sgid(unit, context);
3894 if (r < 0) {
3895 *exit_status = EXIT_SECCOMP;
3896 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3897 }
3898
3899 r = apply_restrict_namespaces(unit, context);
3900 if (r < 0) {
3901 *exit_status = EXIT_SECCOMP;
3902 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3903 }
3904
3905 r = apply_protect_sysctl(unit, context);
3906 if (r < 0) {
3907 *exit_status = EXIT_SECCOMP;
3908 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3909 }
3910
3911 r = apply_protect_kernel_modules(unit, context);
3912 if (r < 0) {
3913 *exit_status = EXIT_SECCOMP;
3914 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3915 }
3916
3917 r = apply_protect_kernel_logs(unit, context);
3918 if (r < 0) {
3919 *exit_status = EXIT_SECCOMP;
3920 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
3921 }
3922
3923 r = apply_protect_clock(unit, context);
3924 if (r < 0) {
3925 *exit_status = EXIT_SECCOMP;
3926 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
3927 }
3928
3929 r = apply_private_devices(unit, context);
3930 if (r < 0) {
3931 *exit_status = EXIT_SECCOMP;
3932 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3933 }
3934
3935 r = apply_syscall_archs(unit, context);
3936 if (r < 0) {
3937 *exit_status = EXIT_SECCOMP;
3938 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3939 }
3940
3941 r = apply_lock_personality(unit, context);
3942 if (r < 0) {
3943 *exit_status = EXIT_SECCOMP;
3944 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3945 }
3946
3947 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3948 * by the filter as little as possible. */
3949 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3950 if (r < 0) {
3951 *exit_status = EXIT_SECCOMP;
3952 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3953 }
3954 #endif
3955 }
3956
3957 if (!strv_isempty(context->unset_environment)) {
3958 char **ee = NULL;
3959
3960 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3961 if (!ee) {
3962 *exit_status = EXIT_MEMORY;
3963 return log_oom();
3964 }
3965
3966 strv_free_and_replace(accum_env, ee);
3967 }
3968
3969 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3970 replaced_argv = replace_env_argv(command->argv, accum_env);
3971 if (!replaced_argv) {
3972 *exit_status = EXIT_MEMORY;
3973 return log_oom();
3974 }
3975 final_argv = replaced_argv;
3976 } else
3977 final_argv = command->argv;
3978
3979 if (DEBUG_LOGGING) {
3980 _cleanup_free_ char *line;
3981
3982 line = exec_command_line(final_argv);
3983 if (line)
3984 log_struct(LOG_DEBUG,
3985 "EXECUTABLE=%s", command->path,
3986 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3987 LOG_UNIT_ID(unit),
3988 LOG_UNIT_INVOCATION_ID(unit));
3989 }
3990
3991 if (exec_fd >= 0) {
3992 uint8_t hot = 1;
3993
3994 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3995 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3996
3997 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3998 *exit_status = EXIT_EXEC;
3999 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4000 }
4001 }
4002
4003 execve(command->path, final_argv, accum_env);
4004 r = -errno;
4005
4006 if (exec_fd >= 0) {
4007 uint8_t hot = 0;
4008
4009 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4010 * that POLLHUP on it no longer means execve() succeeded. */
4011
4012 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4013 *exit_status = EXIT_EXEC;
4014 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4015 }
4016 }
4017
4018 if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4019 log_struct_errno(LOG_INFO, r,
4020 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4021 LOG_UNIT_ID(unit),
4022 LOG_UNIT_INVOCATION_ID(unit),
4023 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4024 command->path),
4025 "EXECUTABLE=%s", command->path);
4026 return 0;
4027 }
4028
4029 *exit_status = EXIT_EXEC;
4030 return log_unit_error_errno(unit, r, "Failed to execute command: %m");
4031 }
4032
4033 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
4034 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
4035
4036 int exec_spawn(Unit *unit,
4037 ExecCommand *command,
4038 const ExecContext *context,
4039 const ExecParameters *params,
4040 ExecRuntime *runtime,
4041 DynamicCreds *dcreds,
4042 pid_t *ret) {
4043
4044 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
4045 _cleanup_free_ char *subcgroup_path = NULL;
4046 _cleanup_strv_free_ char **files_env = NULL;
4047 size_t n_storage_fds = 0, n_socket_fds = 0;
4048 _cleanup_free_ char *line = NULL;
4049 pid_t pid;
4050
4051 assert(unit);
4052 assert(command);
4053 assert(context);
4054 assert(ret);
4055 assert(params);
4056 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4057
4058 if (context->std_input == EXEC_INPUT_SOCKET ||
4059 context->std_output == EXEC_OUTPUT_SOCKET ||
4060 context->std_error == EXEC_OUTPUT_SOCKET) {
4061
4062 if (params->n_socket_fds > 1) {
4063 log_unit_error(unit, "Got more than one socket.");
4064 return -EINVAL;
4065 }
4066
4067 if (params->n_socket_fds == 0) {
4068 log_unit_error(unit, "Got no socket.");
4069 return -EINVAL;
4070 }
4071
4072 socket_fd = params->fds[0];
4073 } else {
4074 socket_fd = -1;
4075 fds = params->fds;
4076 n_socket_fds = params->n_socket_fds;
4077 n_storage_fds = params->n_storage_fds;
4078 }
4079
4080 r = exec_context_named_iofds(context, params, named_iofds);
4081 if (r < 0)
4082 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4083
4084 r = exec_context_load_environment(unit, context, &files_env);
4085 if (r < 0)
4086 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
4087
4088 line = exec_command_line(command->argv);
4089 if (!line)
4090 return log_oom();
4091
4092 log_struct(LOG_DEBUG,
4093 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
4094 "EXECUTABLE=%s", command->path,
4095 LOG_UNIT_ID(unit),
4096 LOG_UNIT_INVOCATION_ID(unit));
4097
4098 if (params->cgroup_path) {
4099 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4100 if (r < 0)
4101 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4102 if (r > 0) { /* We are using a child cgroup */
4103 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4104 if (r < 0)
4105 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4106 }
4107 }
4108
4109 pid = fork();
4110 if (pid < 0)
4111 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
4112
4113 if (pid == 0) {
4114 int exit_status = EXIT_SUCCESS;
4115
4116 r = exec_child(unit,
4117 command,
4118 context,
4119 params,
4120 runtime,
4121 dcreds,
4122 socket_fd,
4123 named_iofds,
4124 fds,
4125 n_socket_fds,
4126 n_storage_fds,
4127 files_env,
4128 unit->manager->user_lookup_fds[1],
4129 &exit_status);
4130
4131 if (r < 0) {
4132 const char *status =
4133 exit_status_to_string(exit_status,
4134 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
4135
4136 log_struct_errno(LOG_ERR, r,
4137 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4138 LOG_UNIT_ID(unit),
4139 LOG_UNIT_INVOCATION_ID(unit),
4140 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
4141 status, command->path),
4142 "EXECUTABLE=%s", command->path);
4143 }
4144
4145 _exit(exit_status);
4146 }
4147
4148 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
4149
4150 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
4151 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
4152 * process will be killed too). */
4153 if (subcgroup_path)
4154 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
4155
4156 exec_status_start(&command->exec_status, pid);
4157
4158 *ret = pid;
4159 return 0;
4160 }
4161
4162 void exec_context_init(ExecContext *c) {
4163 ExecDirectoryType i;
4164
4165 assert(c);
4166
4167 c->umask = 0022;
4168 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
4169 c->cpu_sched_policy = SCHED_OTHER;
4170 c->syslog_priority = LOG_DAEMON|LOG_INFO;
4171 c->syslog_level_prefix = true;
4172 c->ignore_sigpipe = true;
4173 c->timer_slack_nsec = NSEC_INFINITY;
4174 c->personality = PERSONALITY_INVALID;
4175 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4176 c->directories[i].mode = 0755;
4177 c->timeout_clean_usec = USEC_INFINITY;
4178 c->capability_bounding_set = CAP_ALL;
4179 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4180 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
4181 c->log_level_max = -1;
4182 numa_policy_reset(&c->numa_policy);
4183 }
4184
4185 void exec_context_done(ExecContext *c) {
4186 ExecDirectoryType i;
4187 size_t l;
4188
4189 assert(c);
4190
4191 c->environment = strv_free(c->environment);
4192 c->environment_files = strv_free(c->environment_files);
4193 c->pass_environment = strv_free(c->pass_environment);
4194 c->unset_environment = strv_free(c->unset_environment);
4195
4196 rlimit_free_all(c->rlimit);
4197
4198 for (l = 0; l < 3; l++) {
4199 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
4200 c->stdio_file[l] = mfree(c->stdio_file[l]);
4201 }
4202
4203 c->working_directory = mfree(c->working_directory);
4204 c->root_directory = mfree(c->root_directory);
4205 c->root_image = mfree(c->root_image);
4206 c->tty_path = mfree(c->tty_path);
4207 c->syslog_identifier = mfree(c->syslog_identifier);
4208 c->user = mfree(c->user);
4209 c->group = mfree(c->group);
4210
4211 c->supplementary_groups = strv_free(c->supplementary_groups);
4212
4213 c->pam_name = mfree(c->pam_name);
4214
4215 c->read_only_paths = strv_free(c->read_only_paths);
4216 c->read_write_paths = strv_free(c->read_write_paths);
4217 c->inaccessible_paths = strv_free(c->inaccessible_paths);
4218
4219 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
4220 c->bind_mounts = NULL;
4221 c->n_bind_mounts = 0;
4222 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4223 c->temporary_filesystems = NULL;
4224 c->n_temporary_filesystems = 0;
4225
4226 cpu_set_reset(&c->cpu_set);
4227 numa_policy_reset(&c->numa_policy);
4228
4229 c->utmp_id = mfree(c->utmp_id);
4230 c->selinux_context = mfree(c->selinux_context);
4231 c->apparmor_profile = mfree(c->apparmor_profile);
4232 c->smack_process_label = mfree(c->smack_process_label);
4233
4234 c->syscall_filter = hashmap_free(c->syscall_filter);
4235 c->syscall_archs = set_free(c->syscall_archs);
4236 c->address_families = set_free(c->address_families);
4237
4238 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4239 c->directories[i].paths = strv_free(c->directories[i].paths);
4240
4241 c->log_level_max = -1;
4242
4243 exec_context_free_log_extra_fields(c);
4244
4245 c->log_ratelimit_interval_usec = 0;
4246 c->log_ratelimit_burst = 0;
4247
4248 c->stdin_data = mfree(c->stdin_data);
4249 c->stdin_data_size = 0;
4250
4251 c->network_namespace_path = mfree(c->network_namespace_path);
4252
4253 c->log_namespace = mfree(c->log_namespace);
4254 }
4255
4256 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4257 char **i;
4258
4259 assert(c);
4260
4261 if (!runtime_prefix)
4262 return 0;
4263
4264 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4265 _cleanup_free_ char *p;
4266
4267 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4268 p = path_join(runtime_prefix, "private", *i);
4269 else
4270 p = path_join(runtime_prefix, *i);
4271 if (!p)
4272 return -ENOMEM;
4273
4274 /* We execute this synchronously, since we need to be sure this is gone when we start the
4275 * service next. */
4276 (void) rm_rf(p, REMOVE_ROOT);
4277 }
4278
4279 return 0;
4280 }
4281
4282 static void exec_command_done(ExecCommand *c) {
4283 assert(c);
4284
4285 c->path = mfree(c->path);
4286 c->argv = strv_free(c->argv);
4287 }
4288
4289 void exec_command_done_array(ExecCommand *c, size_t n) {
4290 size_t i;
4291
4292 for (i = 0; i < n; i++)
4293 exec_command_done(c+i);
4294 }
4295
4296 ExecCommand* exec_command_free_list(ExecCommand *c) {
4297 ExecCommand *i;
4298
4299 while ((i = c)) {
4300 LIST_REMOVE(command, c, i);
4301 exec_command_done(i);
4302 free(i);
4303 }
4304
4305 return NULL;
4306 }
4307
4308 void exec_command_free_array(ExecCommand **c, size_t n) {
4309 size_t i;
4310
4311 for (i = 0; i < n; i++)
4312 c[i] = exec_command_free_list(c[i]);
4313 }
4314
4315 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4316 size_t i;
4317
4318 for (i = 0; i < n; i++)
4319 exec_status_reset(&c[i].exec_status);
4320 }
4321
4322 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4323 size_t i;
4324
4325 for (i = 0; i < n; i++) {
4326 ExecCommand *z;
4327
4328 LIST_FOREACH(command, z, c[i])
4329 exec_status_reset(&z->exec_status);
4330 }
4331 }
4332
4333 typedef struct InvalidEnvInfo {
4334 const Unit *unit;
4335 const char *path;
4336 } InvalidEnvInfo;
4337
4338 static void invalid_env(const char *p, void *userdata) {
4339 InvalidEnvInfo *info = userdata;
4340
4341 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4342 }
4343
4344 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4345 assert(c);
4346
4347 switch (fd_index) {
4348
4349 case STDIN_FILENO:
4350 if (c->std_input != EXEC_INPUT_NAMED_FD)
4351 return NULL;
4352
4353 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4354
4355 case STDOUT_FILENO:
4356 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4357 return NULL;
4358
4359 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4360
4361 case STDERR_FILENO:
4362 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4363 return NULL;
4364
4365 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4366
4367 default:
4368 return NULL;
4369 }
4370 }
4371
4372 static int exec_context_named_iofds(
4373 const ExecContext *c,
4374 const ExecParameters *p,
4375 int named_iofds[static 3]) {
4376
4377 size_t i, targets;
4378 const char* stdio_fdname[3];
4379 size_t n_fds;
4380
4381 assert(c);
4382 assert(p);
4383 assert(named_iofds);
4384
4385 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4386 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4387 (c->std_error == EXEC_OUTPUT_NAMED_FD);
4388
4389 for (i = 0; i < 3; i++)
4390 stdio_fdname[i] = exec_context_fdname(c, i);
4391
4392 n_fds = p->n_storage_fds + p->n_socket_fds;
4393
4394 for (i = 0; i < n_fds && targets > 0; i++)
4395 if (named_iofds[STDIN_FILENO] < 0 &&
4396 c->std_input == EXEC_INPUT_NAMED_FD &&
4397 stdio_fdname[STDIN_FILENO] &&
4398 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4399
4400 named_iofds[STDIN_FILENO] = p->fds[i];
4401 targets--;
4402
4403 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4404 c->std_output == EXEC_OUTPUT_NAMED_FD &&
4405 stdio_fdname[STDOUT_FILENO] &&
4406 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4407
4408 named_iofds[STDOUT_FILENO] = p->fds[i];
4409 targets--;
4410
4411 } else if (named_iofds[STDERR_FILENO] < 0 &&
4412 c->std_error == EXEC_OUTPUT_NAMED_FD &&
4413 stdio_fdname[STDERR_FILENO] &&
4414 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4415
4416 named_iofds[STDERR_FILENO] = p->fds[i];
4417 targets--;
4418 }
4419
4420 return targets == 0 ? 0 : -ENOENT;
4421 }
4422
4423 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
4424 char **i, **r = NULL;
4425
4426 assert(c);
4427 assert(l);
4428
4429 STRV_FOREACH(i, c->environment_files) {
4430 char *fn;
4431 int k;
4432 unsigned n;
4433 bool ignore = false;
4434 char **p;
4435 _cleanup_globfree_ glob_t pglob = {};
4436
4437 fn = *i;
4438
4439 if (fn[0] == '-') {
4440 ignore = true;
4441 fn++;
4442 }
4443
4444 if (!path_is_absolute(fn)) {
4445 if (ignore)
4446 continue;
4447
4448 strv_free(r);
4449 return -EINVAL;
4450 }
4451
4452 /* Filename supports globbing, take all matching files */
4453 k = safe_glob(fn, 0, &pglob);
4454 if (k < 0) {
4455 if (ignore)
4456 continue;
4457
4458 strv_free(r);
4459 return k;
4460 }
4461
4462 /* When we don't match anything, -ENOENT should be returned */
4463 assert(pglob.gl_pathc > 0);
4464
4465 for (n = 0; n < pglob.gl_pathc; n++) {
4466 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4467 if (k < 0) {
4468 if (ignore)
4469 continue;
4470
4471 strv_free(r);
4472 return k;
4473 }
4474 /* Log invalid environment variables with filename */
4475 if (p) {
4476 InvalidEnvInfo info = {
4477 .unit = unit,
4478 .path = pglob.gl_pathv[n]
4479 };
4480
4481 p = strv_env_clean_with_callback(p, invalid_env, &info);
4482 }
4483
4484 if (!r)
4485 r = p;
4486 else {
4487 char **m;
4488
4489 m = strv_env_merge(2, r, p);
4490 strv_free(r);
4491 strv_free(p);
4492 if (!m)
4493 return -ENOMEM;
4494
4495 r = m;
4496 }
4497 }
4498 }
4499
4500 *l = r;
4501
4502 return 0;
4503 }
4504
4505 static bool tty_may_match_dev_console(const char *tty) {
4506 _cleanup_free_ char *resolved = NULL;
4507
4508 if (!tty)
4509 return true;
4510
4511 tty = skip_dev_prefix(tty);
4512
4513 /* trivial identity? */
4514 if (streq(tty, "console"))
4515 return true;
4516
4517 if (resolve_dev_console(&resolved) < 0)
4518 return true; /* if we could not resolve, assume it may */
4519
4520 /* "tty0" means the active VC, so it may be the same sometimes */
4521 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4522 }
4523
4524 static bool exec_context_may_touch_tty(const ExecContext *ec) {
4525 assert(ec);
4526
4527 return ec->tty_reset ||
4528 ec->tty_vhangup ||
4529 ec->tty_vt_disallocate ||
4530 is_terminal_input(ec->std_input) ||
4531 is_terminal_output(ec->std_output) ||
4532 is_terminal_output(ec->std_error);
4533 }
4534
4535 bool exec_context_may_touch_console(const ExecContext *ec) {
4536
4537 return exec_context_may_touch_tty(ec) &&
4538 tty_may_match_dev_console(exec_context_tty_path(ec));
4539 }
4540
4541 static void strv_fprintf(FILE *f, char **l) {
4542 char **g;
4543
4544 assert(f);
4545
4546 STRV_FOREACH(g, l)
4547 fprintf(f, " %s", *g);
4548 }
4549
4550 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4551 char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
4552 ExecDirectoryType dt;
4553 unsigned i;
4554 int r;
4555
4556 assert(c);
4557 assert(f);
4558
4559 prefix = strempty(prefix);
4560
4561 fprintf(f,
4562 "%sUMask: %04o\n"
4563 "%sWorkingDirectory: %s\n"
4564 "%sRootDirectory: %s\n"
4565 "%sNonBlocking: %s\n"
4566 "%sPrivateTmp: %s\n"
4567 "%sPrivateDevices: %s\n"
4568 "%sProtectKernelTunables: %s\n"
4569 "%sProtectKernelModules: %s\n"
4570 "%sProtectKernelLogs: %s\n"
4571 "%sProtectClock: %s\n"
4572 "%sProtectControlGroups: %s\n"
4573 "%sPrivateNetwork: %s\n"
4574 "%sPrivateUsers: %s\n"
4575 "%sProtectHome: %s\n"
4576 "%sProtectSystem: %s\n"
4577 "%sMountAPIVFS: %s\n"
4578 "%sIgnoreSIGPIPE: %s\n"
4579 "%sMemoryDenyWriteExecute: %s\n"
4580 "%sRestrictRealtime: %s\n"
4581 "%sRestrictSUIDSGID: %s\n"
4582 "%sKeyringMode: %s\n"
4583 "%sProtectHostname: %s\n",
4584 prefix, c->umask,
4585 prefix, c->working_directory ? c->working_directory : "/",
4586 prefix, c->root_directory ? c->root_directory : "/",
4587 prefix, yes_no(c->non_blocking),
4588 prefix, yes_no(c->private_tmp),
4589 prefix, yes_no(c->private_devices),
4590 prefix, yes_no(c->protect_kernel_tunables),
4591 prefix, yes_no(c->protect_kernel_modules),
4592 prefix, yes_no(c->protect_kernel_logs),
4593 prefix, yes_no(c->protect_clock),
4594 prefix, yes_no(c->protect_control_groups),
4595 prefix, yes_no(c->private_network),
4596 prefix, yes_no(c->private_users),
4597 prefix, protect_home_to_string(c->protect_home),
4598 prefix, protect_system_to_string(c->protect_system),
4599 prefix, yes_no(c->mount_apivfs),
4600 prefix, yes_no(c->ignore_sigpipe),
4601 prefix, yes_no(c->memory_deny_write_execute),
4602 prefix, yes_no(c->restrict_realtime),
4603 prefix, yes_no(c->restrict_suid_sgid),
4604 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4605 prefix, yes_no(c->protect_hostname));
4606
4607 if (c->root_image)
4608 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4609
4610 STRV_FOREACH(e, c->environment)
4611 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4612
4613 STRV_FOREACH(e, c->environment_files)
4614 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4615
4616 STRV_FOREACH(e, c->pass_environment)
4617 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4618
4619 STRV_FOREACH(e, c->unset_environment)
4620 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4621
4622 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4623
4624 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4625 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4626
4627 STRV_FOREACH(d, c->directories[dt].paths)
4628 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4629 }
4630
4631 fprintf(f,
4632 "%sTimeoutCleanSec: %s\n",
4633 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
4634
4635 if (c->nice_set)
4636 fprintf(f,
4637 "%sNice: %i\n",
4638 prefix, c->nice);
4639
4640 if (c->oom_score_adjust_set)
4641 fprintf(f,
4642 "%sOOMScoreAdjust: %i\n",
4643 prefix, c->oom_score_adjust);
4644
4645 if (c->coredump_filter_set)
4646 fprintf(f,
4647 "%sCoredumpFilter: 0x%"PRIx64"\n",
4648 prefix, c->coredump_filter);
4649
4650 for (i = 0; i < RLIM_NLIMITS; i++)
4651 if (c->rlimit[i]) {
4652 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4653 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4654 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4655 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4656 }
4657
4658 if (c->ioprio_set) {
4659 _cleanup_free_ char *class_str = NULL;
4660
4661 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4662 if (r >= 0)
4663 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4664
4665 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4666 }
4667
4668 if (c->cpu_sched_set) {
4669 _cleanup_free_ char *policy_str = NULL;
4670
4671 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4672 if (r >= 0)
4673 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4674
4675 fprintf(f,
4676 "%sCPUSchedulingPriority: %i\n"
4677 "%sCPUSchedulingResetOnFork: %s\n",
4678 prefix, c->cpu_sched_priority,
4679 prefix, yes_no(c->cpu_sched_reset_on_fork));
4680 }
4681
4682 if (c->cpu_set.set) {
4683 _cleanup_free_ char *affinity = NULL;
4684
4685 affinity = cpu_set_to_range_string(&c->cpu_set);
4686 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
4687 }
4688
4689 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
4690 _cleanup_free_ char *nodes = NULL;
4691
4692 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
4693 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
4694 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
4695 }
4696
4697 if (c->timer_slack_nsec != NSEC_INFINITY)
4698 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4699
4700 fprintf(f,
4701 "%sStandardInput: %s\n"
4702 "%sStandardOutput: %s\n"
4703 "%sStandardError: %s\n",
4704 prefix, exec_input_to_string(c->std_input),
4705 prefix, exec_output_to_string(c->std_output),
4706 prefix, exec_output_to_string(c->std_error));
4707
4708 if (c->std_input == EXEC_INPUT_NAMED_FD)
4709 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4710 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4711 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4712 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4713 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4714
4715 if (c->std_input == EXEC_INPUT_FILE)
4716 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4717 if (c->std_output == EXEC_OUTPUT_FILE)
4718 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4719 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4720 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4721 if (c->std_error == EXEC_OUTPUT_FILE)
4722 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4723 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4724 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4725
4726 if (c->tty_path)
4727 fprintf(f,
4728 "%sTTYPath: %s\n"
4729 "%sTTYReset: %s\n"
4730 "%sTTYVHangup: %s\n"
4731 "%sTTYVTDisallocate: %s\n",
4732 prefix, c->tty_path,
4733 prefix, yes_no(c->tty_reset),
4734 prefix, yes_no(c->tty_vhangup),
4735 prefix, yes_no(c->tty_vt_disallocate));
4736
4737 if (IN_SET(c->std_output,
4738 EXEC_OUTPUT_SYSLOG,
4739 EXEC_OUTPUT_KMSG,
4740 EXEC_OUTPUT_JOURNAL,
4741 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4742 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4743 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4744 IN_SET(c->std_error,
4745 EXEC_OUTPUT_SYSLOG,
4746 EXEC_OUTPUT_KMSG,
4747 EXEC_OUTPUT_JOURNAL,
4748 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4749 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4750 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4751
4752 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4753
4754 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4755 if (r >= 0)
4756 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4757
4758 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4759 if (r >= 0)
4760 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4761 }
4762
4763 if (c->log_level_max >= 0) {
4764 _cleanup_free_ char *t = NULL;
4765
4766 (void) log_level_to_string_alloc(c->log_level_max, &t);
4767
4768 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4769 }
4770
4771 if (c->log_ratelimit_interval_usec > 0) {
4772 char buf_timespan[FORMAT_TIMESPAN_MAX];
4773
4774 fprintf(f,
4775 "%sLogRateLimitIntervalSec: %s\n",
4776 prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
4777 }
4778
4779 if (c->log_ratelimit_burst > 0)
4780 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
4781
4782 if (c->n_log_extra_fields > 0) {
4783 size_t j;
4784
4785 for (j = 0; j < c->n_log_extra_fields; j++) {
4786 fprintf(f, "%sLogExtraFields: ", prefix);
4787 fwrite(c->log_extra_fields[j].iov_base,
4788 1, c->log_extra_fields[j].iov_len,
4789 f);
4790 fputc('\n', f);
4791 }
4792 }
4793
4794 if (c->log_namespace)
4795 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
4796
4797 if (c->secure_bits) {
4798 _cleanup_free_ char *str = NULL;
4799
4800 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4801 if (r >= 0)
4802 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4803 }
4804
4805 if (c->capability_bounding_set != CAP_ALL) {
4806 _cleanup_free_ char *str = NULL;
4807
4808 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4809 if (r >= 0)
4810 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4811 }
4812
4813 if (c->capability_ambient_set != 0) {
4814 _cleanup_free_ char *str = NULL;
4815
4816 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4817 if (r >= 0)
4818 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4819 }
4820
4821 if (c->user)
4822 fprintf(f, "%sUser: %s\n", prefix, c->user);
4823 if (c->group)
4824 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4825
4826 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4827
4828 if (!strv_isempty(c->supplementary_groups)) {
4829 fprintf(f, "%sSupplementaryGroups:", prefix);
4830 strv_fprintf(f, c->supplementary_groups);
4831 fputs("\n", f);
4832 }
4833
4834 if (c->pam_name)
4835 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4836
4837 if (!strv_isempty(c->read_write_paths)) {
4838 fprintf(f, "%sReadWritePaths:", prefix);
4839 strv_fprintf(f, c->read_write_paths);
4840 fputs("\n", f);
4841 }
4842
4843 if (!strv_isempty(c->read_only_paths)) {
4844 fprintf(f, "%sReadOnlyPaths:", prefix);
4845 strv_fprintf(f, c->read_only_paths);
4846 fputs("\n", f);
4847 }
4848
4849 if (!strv_isempty(c->inaccessible_paths)) {
4850 fprintf(f, "%sInaccessiblePaths:", prefix);
4851 strv_fprintf(f, c->inaccessible_paths);
4852 fputs("\n", f);
4853 }
4854
4855 if (c->n_bind_mounts > 0)
4856 for (i = 0; i < c->n_bind_mounts; i++)
4857 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4858 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4859 c->bind_mounts[i].ignore_enoent ? "-": "",
4860 c->bind_mounts[i].source,
4861 c->bind_mounts[i].destination,
4862 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4863
4864 if (c->n_temporary_filesystems > 0)
4865 for (i = 0; i < c->n_temporary_filesystems; i++) {
4866 TemporaryFileSystem *t = c->temporary_filesystems + i;
4867
4868 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4869 t->path,
4870 isempty(t->options) ? "" : ":",
4871 strempty(t->options));
4872 }
4873
4874 if (c->utmp_id)
4875 fprintf(f,
4876 "%sUtmpIdentifier: %s\n",
4877 prefix, c->utmp_id);
4878
4879 if (c->selinux_context)
4880 fprintf(f,
4881 "%sSELinuxContext: %s%s\n",
4882 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4883
4884 if (c->apparmor_profile)
4885 fprintf(f,
4886 "%sAppArmorProfile: %s%s\n",
4887 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4888
4889 if (c->smack_process_label)
4890 fprintf(f,
4891 "%sSmackProcessLabel: %s%s\n",
4892 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4893
4894 if (c->personality != PERSONALITY_INVALID)
4895 fprintf(f,
4896 "%sPersonality: %s\n",
4897 prefix, strna(personality_to_string(c->personality)));
4898
4899 fprintf(f,
4900 "%sLockPersonality: %s\n",
4901 prefix, yes_no(c->lock_personality));
4902
4903 if (c->syscall_filter) {
4904 #if HAVE_SECCOMP
4905 Iterator j;
4906 void *id, *val;
4907 bool first = true;
4908 #endif
4909
4910 fprintf(f,
4911 "%sSystemCallFilter: ",
4912 prefix);
4913
4914 if (!c->syscall_whitelist)
4915 fputc('~', f);
4916
4917 #if HAVE_SECCOMP
4918 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4919 _cleanup_free_ char *name = NULL;
4920 const char *errno_name = NULL;
4921 int num = PTR_TO_INT(val);
4922
4923 if (first)
4924 first = false;
4925 else
4926 fputc(' ', f);
4927
4928 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4929 fputs(strna(name), f);
4930
4931 if (num >= 0) {
4932 errno_name = errno_to_name(num);
4933 if (errno_name)
4934 fprintf(f, ":%s", errno_name);
4935 else
4936 fprintf(f, ":%d", num);
4937 }
4938 }
4939 #endif
4940
4941 fputc('\n', f);
4942 }
4943
4944 if (c->syscall_archs) {
4945 #if HAVE_SECCOMP
4946 Iterator j;
4947 void *id;
4948 #endif
4949
4950 fprintf(f,
4951 "%sSystemCallArchitectures:",
4952 prefix);
4953
4954 #if HAVE_SECCOMP
4955 SET_FOREACH(id, c->syscall_archs, j)
4956 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4957 #endif
4958 fputc('\n', f);
4959 }
4960
4961 if (exec_context_restrict_namespaces_set(c)) {
4962 _cleanup_free_ char *s = NULL;
4963
4964 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4965 if (r >= 0)
4966 fprintf(f, "%sRestrictNamespaces: %s\n",
4967 prefix, strna(s));
4968 }
4969
4970 if (c->network_namespace_path)
4971 fprintf(f,
4972 "%sNetworkNamespacePath: %s\n",
4973 prefix, c->network_namespace_path);
4974
4975 if (c->syscall_errno > 0) {
4976 const char *errno_name;
4977
4978 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4979
4980 errno_name = errno_to_name(c->syscall_errno);
4981 if (errno_name)
4982 fprintf(f, "%s\n", errno_name);
4983 else
4984 fprintf(f, "%d\n", c->syscall_errno);
4985 }
4986 }
4987
4988 bool exec_context_maintains_privileges(const ExecContext *c) {
4989 assert(c);
4990
4991 /* Returns true if the process forked off would run under
4992 * an unchanged UID or as root. */
4993
4994 if (!c->user)
4995 return true;
4996
4997 if (streq(c->user, "root") || streq(c->user, "0"))
4998 return true;
4999
5000 return false;
5001 }
5002
5003 int exec_context_get_effective_ioprio(const ExecContext *c) {
5004 int p;
5005
5006 assert(c);
5007
5008 if (c->ioprio_set)
5009 return c->ioprio;
5010
5011 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5012 if (p < 0)
5013 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
5014
5015 return p;
5016 }
5017
5018 void exec_context_free_log_extra_fields(ExecContext *c) {
5019 size_t l;
5020
5021 assert(c);
5022
5023 for (l = 0; l < c->n_log_extra_fields; l++)
5024 free(c->log_extra_fields[l].iov_base);
5025 c->log_extra_fields = mfree(c->log_extra_fields);
5026 c->n_log_extra_fields = 0;
5027 }
5028
5029 void exec_context_revert_tty(ExecContext *c) {
5030 int r;
5031
5032 assert(c);
5033
5034 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
5035 exec_context_tty_reset(c, NULL);
5036
5037 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
5038 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
5039 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
5040
5041 if (exec_context_may_touch_tty(c)) {
5042 const char *path;
5043
5044 path = exec_context_tty_path(c);
5045 if (path) {
5046 r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
5047 if (r < 0 && r != -ENOENT)
5048 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
5049 }
5050 }
5051 }
5052
5053 int exec_context_get_clean_directories(
5054 ExecContext *c,
5055 char **prefix,
5056 ExecCleanMask mask,
5057 char ***ret) {
5058
5059 _cleanup_strv_free_ char **l = NULL;
5060 ExecDirectoryType t;
5061 int r;
5062
5063 assert(c);
5064 assert(prefix);
5065 assert(ret);
5066
5067 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5068 char **i;
5069
5070 if (!FLAGS_SET(mask, 1U << t))
5071 continue;
5072
5073 if (!prefix[t])
5074 continue;
5075
5076 STRV_FOREACH(i, c->directories[t].paths) {
5077 char *j;
5078
5079 j = path_join(prefix[t], *i);
5080 if (!j)
5081 return -ENOMEM;
5082
5083 r = strv_consume(&l, j);
5084 if (r < 0)
5085 return r;
5086
5087 /* Also remove private directories unconditionally. */
5088 if (t != EXEC_DIRECTORY_CONFIGURATION) {
5089 j = path_join(prefix[t], "private", *i);
5090 if (!j)
5091 return -ENOMEM;
5092
5093 r = strv_consume(&l, j);
5094 if (r < 0)
5095 return r;
5096 }
5097 }
5098 }
5099
5100 *ret = TAKE_PTR(l);
5101 return 0;
5102 }
5103
5104 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
5105 ExecCleanMask mask = 0;
5106
5107 assert(c);
5108 assert(ret);
5109
5110 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5111 if (!strv_isempty(c->directories[t].paths))
5112 mask |= 1U << t;
5113
5114 *ret = mask;
5115 return 0;
5116 }
5117
5118 void exec_status_start(ExecStatus *s, pid_t pid) {
5119 assert(s);
5120
5121 *s = (ExecStatus) {
5122 .pid = pid,
5123 };
5124
5125 dual_timestamp_get(&s->start_timestamp);
5126 }
5127
5128 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
5129 assert(s);
5130
5131 if (s->pid != pid) {
5132 *s = (ExecStatus) {
5133 .pid = pid,
5134 };
5135 }
5136
5137 dual_timestamp_get(&s->exit_timestamp);
5138
5139 s->code = code;
5140 s->status = status;
5141
5142 if (context && context->utmp_id)
5143 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
5144 }
5145
5146 void exec_status_reset(ExecStatus *s) {
5147 assert(s);
5148
5149 *s = (ExecStatus) {};
5150 }
5151
5152 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
5153 char buf[FORMAT_TIMESTAMP_MAX];
5154
5155 assert(s);
5156 assert(f);
5157
5158 if (s->pid <= 0)
5159 return;
5160
5161 prefix = strempty(prefix);
5162
5163 fprintf(f,
5164 "%sPID: "PID_FMT"\n",
5165 prefix, s->pid);
5166
5167 if (dual_timestamp_is_set(&s->start_timestamp))
5168 fprintf(f,
5169 "%sStart Timestamp: %s\n",
5170 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
5171
5172 if (dual_timestamp_is_set(&s->exit_timestamp))
5173 fprintf(f,
5174 "%sExit Timestamp: %s\n"
5175 "%sExit Code: %s\n"
5176 "%sExit Status: %i\n",
5177 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
5178 prefix, sigchld_code_to_string(s->code),
5179 prefix, s->status);
5180 }
5181
5182 static char *exec_command_line(char **argv) {
5183 size_t k;
5184 char *n, *p, **a;
5185 bool first = true;
5186
5187 assert(argv);
5188
5189 k = 1;
5190 STRV_FOREACH(a, argv)
5191 k += strlen(*a)+3;
5192
5193 n = new(char, k);
5194 if (!n)
5195 return NULL;
5196
5197 p = n;
5198 STRV_FOREACH(a, argv) {
5199
5200 if (!first)
5201 *(p++) = ' ';
5202 else
5203 first = false;
5204
5205 if (strpbrk(*a, WHITESPACE)) {
5206 *(p++) = '\'';
5207 p = stpcpy(p, *a);
5208 *(p++) = '\'';
5209 } else
5210 p = stpcpy(p, *a);
5211
5212 }
5213
5214 *p = 0;
5215
5216 /* FIXME: this doesn't really handle arguments that have
5217 * spaces and ticks in them */
5218
5219 return n;
5220 }
5221
5222 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
5223 _cleanup_free_ char *cmd = NULL;
5224 const char *prefix2;
5225
5226 assert(c);
5227 assert(f);
5228
5229 prefix = strempty(prefix);
5230 prefix2 = strjoina(prefix, "\t");
5231
5232 cmd = exec_command_line(c->argv);
5233 fprintf(f,
5234 "%sCommand Line: %s\n",
5235 prefix, cmd ? cmd : strerror_safe(ENOMEM));
5236
5237 exec_status_dump(&c->exec_status, f, prefix2);
5238 }
5239
5240 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
5241 assert(f);
5242
5243 prefix = strempty(prefix);
5244
5245 LIST_FOREACH(command, c, c)
5246 exec_command_dump(c, f, prefix);
5247 }
5248
5249 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
5250 ExecCommand *end;
5251
5252 assert(l);
5253 assert(e);
5254
5255 if (*l) {
5256 /* It's kind of important, that we keep the order here */
5257 LIST_FIND_TAIL(command, *l, end);
5258 LIST_INSERT_AFTER(command, *l, end, e);
5259 } else
5260 *l = e;
5261 }
5262
5263 int exec_command_set(ExecCommand *c, const char *path, ...) {
5264 va_list ap;
5265 char **l, *p;
5266
5267 assert(c);
5268 assert(path);
5269
5270 va_start(ap, path);
5271 l = strv_new_ap(path, ap);
5272 va_end(ap);
5273
5274 if (!l)
5275 return -ENOMEM;
5276
5277 p = strdup(path);
5278 if (!p) {
5279 strv_free(l);
5280 return -ENOMEM;
5281 }
5282
5283 free_and_replace(c->path, p);
5284
5285 return strv_free_and_replace(c->argv, l);
5286 }
5287
5288 int exec_command_append(ExecCommand *c, const char *path, ...) {
5289 _cleanup_strv_free_ char **l = NULL;
5290 va_list ap;
5291 int r;
5292
5293 assert(c);
5294 assert(path);
5295
5296 va_start(ap, path);
5297 l = strv_new_ap(path, ap);
5298 va_end(ap);
5299
5300 if (!l)
5301 return -ENOMEM;
5302
5303 r = strv_extend_strv(&c->argv, l, false);
5304 if (r < 0)
5305 return r;
5306
5307 return 0;
5308 }
5309
5310 static void *remove_tmpdir_thread(void *p) {
5311 _cleanup_free_ char *path = p;
5312
5313 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5314 return NULL;
5315 }
5316
5317 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5318 int r;
5319
5320 if (!rt)
5321 return NULL;
5322
5323 if (rt->manager)
5324 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
5325
5326 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5327 if (destroy && rt->tmp_dir) {
5328 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
5329
5330 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
5331 if (r < 0) {
5332 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
5333 free(rt->tmp_dir);
5334 }
5335
5336 rt->tmp_dir = NULL;
5337 }
5338
5339 if (destroy && rt->var_tmp_dir) {
5340 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
5341
5342 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
5343 if (r < 0) {
5344 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
5345 free(rt->var_tmp_dir);
5346 }
5347
5348 rt->var_tmp_dir = NULL;
5349 }
5350
5351 rt->id = mfree(rt->id);
5352 rt->tmp_dir = mfree(rt->tmp_dir);
5353 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
5354 safe_close_pair(rt->netns_storage_socket);
5355 return mfree(rt);
5356 }
5357
5358 static void exec_runtime_freep(ExecRuntime **rt) {
5359 (void) exec_runtime_free(*rt, false);
5360 }
5361
5362 static int exec_runtime_allocate(ExecRuntime **ret) {
5363 ExecRuntime *n;
5364
5365 assert(ret);
5366
5367 n = new(ExecRuntime, 1);
5368 if (!n)
5369 return -ENOMEM;
5370
5371 *n = (ExecRuntime) {
5372 .netns_storage_socket = { -1, -1 },
5373 };
5374
5375 *ret = n;
5376 return 0;
5377 }
5378
5379 static int exec_runtime_add(
5380 Manager *m,
5381 const char *id,
5382 const char *tmp_dir,
5383 const char *var_tmp_dir,
5384 const int netns_storage_socket[2],
5385 ExecRuntime **ret) {
5386
5387 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
5388 int r;
5389
5390 assert(m);
5391 assert(id);
5392
5393 r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
5394 if (r < 0)
5395 return r;
5396
5397 r = exec_runtime_allocate(&rt);
5398 if (r < 0)
5399 return r;
5400
5401 rt->id = strdup(id);
5402 if (!rt->id)
5403 return -ENOMEM;
5404
5405 if (tmp_dir) {
5406 rt->tmp_dir = strdup(tmp_dir);
5407 if (!rt->tmp_dir)
5408 return -ENOMEM;
5409
5410 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
5411 assert(var_tmp_dir);
5412 rt->var_tmp_dir = strdup(var_tmp_dir);
5413 if (!rt->var_tmp_dir)
5414 return -ENOMEM;
5415 }
5416
5417 if (netns_storage_socket) {
5418 rt->netns_storage_socket[0] = netns_storage_socket[0];
5419 rt->netns_storage_socket[1] = netns_storage_socket[1];
5420 }
5421
5422 r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
5423 if (r < 0)
5424 return r;
5425
5426 rt->manager = m;
5427
5428 if (ret)
5429 *ret = rt;
5430
5431 /* do not remove created ExecRuntime object when the operation succeeds. */
5432 rt = NULL;
5433 return 0;
5434 }
5435
5436 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5437 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
5438 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
5439 int r;
5440
5441 assert(m);
5442 assert(c);
5443 assert(id);
5444
5445 /* It is not necessary to create ExecRuntime object. */
5446 if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
5447 return 0;
5448
5449 if (c->private_tmp &&
5450 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
5451 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
5452 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
5453 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
5454 if (r < 0)
5455 return r;
5456 }
5457
5458 if (c->private_network || c->network_namespace_path) {
5459 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5460 return -errno;
5461 }
5462
5463 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
5464 if (r < 0)
5465 return r;
5466
5467 /* Avoid cleanup */
5468 netns_storage_socket[0] = netns_storage_socket[1] = -1;
5469 return 1;
5470 }
5471
5472 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5473 ExecRuntime *rt;
5474 int r;
5475
5476 assert(m);
5477 assert(id);
5478 assert(ret);
5479
5480 rt = hashmap_get(m->exec_runtime_by_id, id);
5481 if (rt)
5482 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5483 goto ref;
5484
5485 if (!create)
5486 return 0;
5487
5488 /* If not found, then create a new object. */
5489 r = exec_runtime_make(m, c, id, &rt);
5490 if (r <= 0)
5491 /* When r == 0, it is not necessary to create ExecRuntime object. */
5492 return r;
5493
5494 ref:
5495 /* increment reference counter. */
5496 rt->n_ref++;
5497 *ret = rt;
5498 return 1;
5499 }
5500
5501 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5502 if (!rt)
5503 return NULL;
5504
5505 assert(rt->n_ref > 0);
5506
5507 rt->n_ref--;
5508 if (rt->n_ref > 0)
5509 return NULL;
5510
5511 return exec_runtime_free(rt, destroy);
5512 }
5513
5514 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5515 ExecRuntime *rt;
5516 Iterator i;
5517
5518 assert(m);
5519 assert(f);
5520 assert(fds);
5521
5522 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5523 fprintf(f, "exec-runtime=%s", rt->id);
5524
5525 if (rt->tmp_dir)
5526 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
5527
5528 if (rt->var_tmp_dir)
5529 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
5530
5531 if (rt->netns_storage_socket[0] >= 0) {
5532 int copy;
5533
5534 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5535 if (copy < 0)
5536 return copy;
5537
5538 fprintf(f, " netns-socket-0=%i", copy);
5539 }
5540
5541 if (rt->netns_storage_socket[1] >= 0) {
5542 int copy;
5543
5544 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5545 if (copy < 0)
5546 return copy;
5547
5548 fprintf(f, " netns-socket-1=%i", copy);
5549 }
5550
5551 fputc('\n', f);
5552 }
5553
5554 return 0;
5555 }
5556
5557 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5558 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5559 ExecRuntime *rt;
5560 int r;
5561
5562 /* This is for the migration from old (v237 or earlier) deserialization text.
5563 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5564 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5565 * so or not from the serialized text, then we always creates a new object owned by this. */
5566
5567 assert(u);
5568 assert(key);
5569 assert(value);
5570
5571 /* Manager manages ExecRuntime objects by the unit id.
5572 * So, we omit the serialized text when the unit does not have id (yet?)... */
5573 if (isempty(u->id)) {
5574 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5575 return 0;
5576 }
5577
5578 r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5579 if (r < 0) {
5580 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5581 return 0;
5582 }
5583
5584 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5585 if (!rt) {
5586 r = exec_runtime_allocate(&rt_create);
5587 if (r < 0)
5588 return log_oom();
5589
5590 rt_create->id = strdup(u->id);
5591 if (!rt_create->id)
5592 return log_oom();
5593
5594 rt = rt_create;
5595 }
5596
5597 if (streq(key, "tmp-dir")) {
5598 char *copy;
5599
5600 copy = strdup(value);
5601 if (!copy)
5602 return log_oom();
5603
5604 free_and_replace(rt->tmp_dir, copy);
5605
5606 } else if (streq(key, "var-tmp-dir")) {
5607 char *copy;
5608
5609 copy = strdup(value);
5610 if (!copy)
5611 return log_oom();
5612
5613 free_and_replace(rt->var_tmp_dir, copy);
5614
5615 } else if (streq(key, "netns-socket-0")) {
5616 int fd;
5617
5618 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5619 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5620 return 0;
5621 }
5622
5623 safe_close(rt->netns_storage_socket[0]);
5624 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5625
5626 } else if (streq(key, "netns-socket-1")) {
5627 int fd;
5628
5629 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5630 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5631 return 0;
5632 }
5633
5634 safe_close(rt->netns_storage_socket[1]);
5635 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5636 } else
5637 return 0;
5638
5639 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5640 if (rt_create) {
5641 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5642 if (r < 0) {
5643 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5644 return 0;
5645 }
5646
5647 rt_create->manager = u->manager;
5648
5649 /* Avoid cleanup */
5650 rt_create = NULL;
5651 }
5652
5653 return 1;
5654 }
5655
5656 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5657 char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5658 int r, fd0 = -1, fd1 = -1;
5659 const char *p, *v = value;
5660 size_t n;
5661
5662 assert(m);
5663 assert(value);
5664 assert(fds);
5665
5666 n = strcspn(v, " ");
5667 id = strndupa(v, n);
5668 if (v[n] != ' ')
5669 goto finalize;
5670 p = v + n + 1;
5671
5672 v = startswith(p, "tmp-dir=");
5673 if (v) {
5674 n = strcspn(v, " ");
5675 tmp_dir = strndupa(v, n);
5676 if (v[n] != ' ')
5677 goto finalize;
5678 p = v + n + 1;
5679 }
5680
5681 v = startswith(p, "var-tmp-dir=");
5682 if (v) {
5683 n = strcspn(v, " ");
5684 var_tmp_dir = strndupa(v, n);
5685 if (v[n] != ' ')
5686 goto finalize;
5687 p = v + n + 1;
5688 }
5689
5690 v = startswith(p, "netns-socket-0=");
5691 if (v) {
5692 char *buf;
5693
5694 n = strcspn(v, " ");
5695 buf = strndupa(v, n);
5696 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5697 log_debug("Unable to process exec-runtime netns fd specification.");
5698 return;
5699 }
5700 fd0 = fdset_remove(fds, fd0);
5701 if (v[n] != ' ')
5702 goto finalize;
5703 p = v + n + 1;
5704 }
5705
5706 v = startswith(p, "netns-socket-1=");
5707 if (v) {
5708 char *buf;
5709
5710 n = strcspn(v, " ");
5711 buf = strndupa(v, n);
5712 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5713 log_debug("Unable to process exec-runtime netns fd specification.");
5714 return;
5715 }
5716 fd1 = fdset_remove(fds, fd1);
5717 }
5718
5719 finalize:
5720
5721 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5722 if (r < 0)
5723 log_debug_errno(r, "Failed to add exec-runtime: %m");
5724 }
5725
5726 void exec_runtime_vacuum(Manager *m) {
5727 ExecRuntime *rt;
5728 Iterator i;
5729
5730 assert(m);
5731
5732 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5733
5734 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5735 if (rt->n_ref > 0)
5736 continue;
5737
5738 (void) exec_runtime_free(rt, false);
5739 }
5740 }
5741
5742 void exec_params_clear(ExecParameters *p) {
5743 if (!p)
5744 return;
5745
5746 strv_free(p->environment);
5747 }
5748
5749 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5750 [EXEC_INPUT_NULL] = "null",
5751 [EXEC_INPUT_TTY] = "tty",
5752 [EXEC_INPUT_TTY_FORCE] = "tty-force",
5753 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
5754 [EXEC_INPUT_SOCKET] = "socket",
5755 [EXEC_INPUT_NAMED_FD] = "fd",
5756 [EXEC_INPUT_DATA] = "data",
5757 [EXEC_INPUT_FILE] = "file",
5758 };
5759
5760 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5761
5762 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5763 [EXEC_OUTPUT_INHERIT] = "inherit",
5764 [EXEC_OUTPUT_NULL] = "null",
5765 [EXEC_OUTPUT_TTY] = "tty",
5766 [EXEC_OUTPUT_SYSLOG] = "syslog",
5767 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
5768 [EXEC_OUTPUT_KMSG] = "kmsg",
5769 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5770 [EXEC_OUTPUT_JOURNAL] = "journal",
5771 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5772 [EXEC_OUTPUT_SOCKET] = "socket",
5773 [EXEC_OUTPUT_NAMED_FD] = "fd",
5774 [EXEC_OUTPUT_FILE] = "file",
5775 [EXEC_OUTPUT_FILE_APPEND] = "append",
5776 };
5777
5778 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5779
5780 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5781 [EXEC_UTMP_INIT] = "init",
5782 [EXEC_UTMP_LOGIN] = "login",
5783 [EXEC_UTMP_USER] = "user",
5784 };
5785
5786 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5787
5788 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5789 [EXEC_PRESERVE_NO] = "no",
5790 [EXEC_PRESERVE_YES] = "yes",
5791 [EXEC_PRESERVE_RESTART] = "restart",
5792 };
5793
5794 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5795
5796 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
5797 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5798 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5799 [EXEC_DIRECTORY_STATE] = "StateDirectory",
5800 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5801 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5802 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5803 };
5804
5805 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5806
5807 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
5808 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
5809 * directories, specifically .timer units with their timestamp touch file. */
5810 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5811 [EXEC_DIRECTORY_RUNTIME] = "runtime",
5812 [EXEC_DIRECTORY_STATE] = "state",
5813 [EXEC_DIRECTORY_CACHE] = "cache",
5814 [EXEC_DIRECTORY_LOGS] = "logs",
5815 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
5816 };
5817
5818 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
5819
5820 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
5821 * the service payload in. */
5822 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5823 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
5824 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
5825 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
5826 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
5827 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
5828 };
5829
5830 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5831
5832 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5833 [EXEC_KEYRING_INHERIT] = "inherit",
5834 [EXEC_KEYRING_PRIVATE] = "private",
5835 [EXEC_KEYRING_SHARED] = "shared",
5836 };
5837
5838 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);