]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
Merge pull request #10976 from yuwata/typesafe-netlink-call
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <glob.h>
6 #include <grp.h>
7 #include <poll.h>
8 #include <signal.h>
9 #include <string.h>
10 #include <sys/capability.h>
11 #include <sys/eventfd.h>
12 #include <sys/mman.h>
13 #include <sys/personality.h>
14 #include <sys/prctl.h>
15 #include <sys/shm.h>
16 #include <sys/socket.h>
17 #include <sys/stat.h>
18 #include <sys/types.h>
19 #include <sys/un.h>
20 #include <unistd.h>
21 #include <utmpx.h>
22
23 #if HAVE_PAM
24 #include <security/pam_appl.h>
25 #endif
26
27 #if HAVE_SELINUX
28 #include <selinux/selinux.h>
29 #endif
30
31 #if HAVE_SECCOMP
32 #include <seccomp.h>
33 #endif
34
35 #if HAVE_APPARMOR
36 #include <sys/apparmor.h>
37 #endif
38
39 #include "sd-messages.h"
40
41 #include "af-list.h"
42 #include "alloc-util.h"
43 #if HAVE_APPARMOR
44 #include "apparmor-util.h"
45 #endif
46 #include "async.h"
47 #include "barrier.h"
48 #include "cap-list.h"
49 #include "capability-util.h"
50 #include "chown-recursive.h"
51 #include "cpu-set-util.h"
52 #include "def.h"
53 #include "env-file.h"
54 #include "env-util.h"
55 #include "errno-list.h"
56 #include "execute.h"
57 #include "exit-status.h"
58 #include "fd-util.h"
59 #include "format-util.h"
60 #include "fs-util.h"
61 #include "glob-util.h"
62 #include "io-util.h"
63 #include "ioprio.h"
64 #include "label.h"
65 #include "log.h"
66 #include "macro.h"
67 #include "manager.h"
68 #include "missing.h"
69 #include "mkdir.h"
70 #include "namespace.h"
71 #include "parse-util.h"
72 #include "path-util.h"
73 #include "process-util.h"
74 #include "rlimit-util.h"
75 #include "rm-rf.h"
76 #if HAVE_SECCOMP
77 #include "seccomp-util.h"
78 #endif
79 #include "securebits.h"
80 #include "securebits-util.h"
81 #include "selinux-util.h"
82 #include "signal-util.h"
83 #include "smack-util.h"
84 #include "socket-util.h"
85 #include "special.h"
86 #include "stat-util.h"
87 #include "string-table.h"
88 #include "string-util.h"
89 #include "strv.h"
90 #include "syslog-util.h"
91 #include "terminal-util.h"
92 #include "umask-util.h"
93 #include "unit.h"
94 #include "user-util.h"
95 #include "util.h"
96 #include "utmp-wtmp.h"
97
98 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
99 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
100
101 /* This assumes there is a 'tty' group */
102 #define TTY_MODE 0620
103
104 #define SNDBUF_SIZE (8*1024*1024)
105
/* Rearranges the passed fds so that array entry N ends up as fd number N+3, i.e. directly after
 * stdin/stdout/stderr. The fds array is updated in place to reflect the new fd numbers. Returns 0 on
 * success, or a negative errno-style error if duplicating an fd fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0, retry_at;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int idx;

                retry_at = -1;

                for (idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, dup_fd;

                        /* Nothing to do if this one already sits in its final slot */
                        if (fds[idx] == wanted)
                                continue;

                        /* F_DUPFD hands us the lowest free fd that is >= wanted */
                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* The slot we were after was still occupied. Remember the first index where
                         * that happened, so that the next pass starts over from there. */
                        if (dup_fd != wanted && retry_at < 0)
                                retry_at = idx;
                }

                first = retry_at;
        } while (retry_at >= 0);

        return 0;
}
150
/* Adjusts the file descriptor flags of the fds we are about to pass on: the first n_socket_fds entries get
 * O_NONBLOCK set or cleared according to 'nonblock', and FD_CLOEXEC is turned off on every entry (socket
 * and storage fds alike). Returns 0 on success, or the first negative error encountered. */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds, idx;
        int r;

        if (total == 0)
                return 0;

        assert(fds);

        /* Socket activation fds come first: O_NONBLOCK only matters for those. */
        for (idx = 0; idx < n_socket_fds; idx++) {
                r = fd_nonblock(fds[idx], nonblock);
                if (r < 0)
                        return r;

                /* These fds are meant to survive into the child, hence FD_CLOEXEC always goes away */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        /* The remaining (storage) fds only need FD_CLOEXEC dropped */
        for (idx = n_socket_fds; idx < total; idx++) {
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
183
184 static const char *exec_context_tty_path(const ExecContext *context) {
185 assert(context);
186
187 if (context->stdio_as_fds)
188 return NULL;
189
190 if (context->tty_path)
191 return context->tty_path;
192
193 return "/dev/console";
194 }
195
196 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
197 const char *path;
198
199 assert(context);
200
201 path = exec_context_tty_path(context);
202
203 if (context->tty_vhangup) {
204 if (p && p->stdin_fd >= 0)
205 (void) terminal_vhangup_fd(p->stdin_fd);
206 else if (path)
207 (void) terminal_vhangup(path);
208 }
209
210 if (context->tty_reset) {
211 if (p && p->stdin_fd >= 0)
212 (void) reset_terminal_fd(p->stdin_fd, true);
213 else if (path)
214 (void) reset_terminal(path);
215 }
216
217 if (context->tty_vt_disallocate && path)
218 (void) vt_disallocate(path);
219 }
220
221 static bool is_terminal_input(ExecInput i) {
222 return IN_SET(i,
223 EXEC_INPUT_TTY,
224 EXEC_INPUT_TTY_FORCE,
225 EXEC_INPUT_TTY_FAIL);
226 }
227
228 static bool is_terminal_output(ExecOutput o) {
229 return IN_SET(o,
230 EXEC_OUTPUT_TTY,
231 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
232 EXEC_OUTPUT_KMSG_AND_CONSOLE,
233 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
234 }
235
236 static bool is_syslog_output(ExecOutput o) {
237 return IN_SET(o,
238 EXEC_OUTPUT_SYSLOG,
239 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
240 }
241
242 static bool is_kmsg_output(ExecOutput o) {
243 return IN_SET(o,
244 EXEC_OUTPUT_KMSG,
245 EXEC_OUTPUT_KMSG_AND_CONSOLE);
246 }
247
248 static bool exec_context_needs_term(const ExecContext *c) {
249 assert(c);
250
251 /* Return true if the execution context suggests we should set $TERM to something useful. */
252
253 if (is_terminal_input(c->std_input))
254 return true;
255
256 if (is_terminal_output(c->std_output))
257 return true;
258
259 if (is_terminal_output(c->std_error))
260 return true;
261
262 return !!c->tty_path;
263 }
264
/* Opens /dev/null with the specified open() flags (plus O_NOCTTY) and moves the resulting fd to fd number
 * 'nfd'. Returns what move_fd() returns, or a negative errno-style error if the open() fails. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
276
277 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
278 static const union sockaddr_union sa = {
279 .un.sun_family = AF_UNIX,
280 .un.sun_path = "/run/systemd/journal/stdout",
281 };
282 uid_t olduid = UID_INVALID;
283 gid_t oldgid = GID_INVALID;
284 int r;
285
286 if (gid_is_valid(gid)) {
287 oldgid = getgid();
288
289 if (setegid(gid) < 0)
290 return -errno;
291 }
292
293 if (uid_is_valid(uid)) {
294 olduid = getuid();
295
296 if (seteuid(uid) < 0) {
297 r = -errno;
298 goto restore_gid;
299 }
300 }
301
302 r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
303
304 /* If we fail to restore the uid or gid, things will likely
305 fail later on. This should only happen if an LSM interferes. */
306
307 if (uid_is_valid(uid))
308 (void) seteuid(olduid);
309
310 restore_gid:
311 if (gid_is_valid(gid))
312 (void) setegid(oldgid);
313
314 return r;
315 }
316
317 static int connect_logger_as(
318 const Unit *unit,
319 const ExecContext *context,
320 const ExecParameters *params,
321 ExecOutput output,
322 const char *ident,
323 int nfd,
324 uid_t uid,
325 gid_t gid) {
326
327 _cleanup_close_ int fd = -1;
328 int r;
329
330 assert(context);
331 assert(params);
332 assert(output < _EXEC_OUTPUT_MAX);
333 assert(ident);
334 assert(nfd >= 0);
335
336 fd = socket(AF_UNIX, SOCK_STREAM, 0);
337 if (fd < 0)
338 return -errno;
339
340 r = connect_journal_socket(fd, uid, gid);
341 if (r < 0)
342 return r;
343
344 if (shutdown(fd, SHUT_RD) < 0)
345 return -errno;
346
347 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
348
349 if (dprintf(fd,
350 "%s\n"
351 "%s\n"
352 "%i\n"
353 "%i\n"
354 "%i\n"
355 "%i\n"
356 "%i\n",
357 context->syslog_identifier ?: ident,
358 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
359 context->syslog_priority,
360 !!context->syslog_level_prefix,
361 is_syslog_output(output),
362 is_kmsg_output(output),
363 is_terminal_output(output)) < 0)
364 return -errno;
365
366 return move_fd(TAKE_FD(fd), nfd, false);
367 }
368
/* Opens the terminal at 'path' with the specified flags (plus O_NOCTTY) and moves the resulting fd to fd
 * number 'nfd'. Returns what move_fd() returns, or the negative error from open_terminal(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
381
382 static int acquire_path(const char *path, int flags, mode_t mode) {
383 union sockaddr_union sa = {};
384 _cleanup_close_ int fd = -1;
385 int r, salen;
386
387 assert(path);
388
389 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
390 flags |= O_CREAT;
391
392 fd = open(path, flags|O_NOCTTY, mode);
393 if (fd >= 0)
394 return TAKE_FD(fd);
395
396 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
397 return -errno;
398 if (strlen(path) >= sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
399 return -ENXIO;
400
401 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
402
403 fd = socket(AF_UNIX, SOCK_STREAM, 0);
404 if (fd < 0)
405 return -errno;
406
407 salen = sockaddr_un_set_path(&sa.un, path);
408 if (salen < 0)
409 return salen;
410
411 if (connect(fd, &sa.sa, salen) < 0)
412 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
413 * indication that his wasn't an AF_UNIX socket after all */
414
415 if ((flags & O_ACCMODE) == O_RDONLY)
416 r = shutdown(fd, SHUT_WR);
417 else if ((flags & O_ACCMODE) == O_WRONLY)
418 r = shutdown(fd, SHUT_RD);
419 else
420 return TAKE_FD(fd);
421 if (r < 0)
422 return -errno;
423
424 return TAKE_FD(fd);
425 }
426
427 static int fixup_input(
428 const ExecContext *context,
429 int socket_fd,
430 bool apply_tty_stdin) {
431
432 ExecInput std_input;
433
434 assert(context);
435
436 std_input = context->std_input;
437
438 if (is_terminal_input(std_input) && !apply_tty_stdin)
439 return EXEC_INPUT_NULL;
440
441 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
442 return EXEC_INPUT_NULL;
443
444 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
445 return EXEC_INPUT_NULL;
446
447 return std_input;
448 }
449
450 static int fixup_output(ExecOutput std_output, int socket_fd) {
451
452 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
453 return EXEC_OUTPUT_INHERIT;
454
455 return std_output;
456 }
457
458 static int setup_input(
459 const ExecContext *context,
460 const ExecParameters *params,
461 int socket_fd,
462 int named_iofds[3]) {
463
464 ExecInput i;
465
466 assert(context);
467 assert(params);
468
469 if (params->stdin_fd >= 0) {
470 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
471 return -errno;
472
473 /* Try to make this the controlling tty, if it is a tty, and reset it */
474 if (isatty(STDIN_FILENO)) {
475 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
476 (void) reset_terminal_fd(STDIN_FILENO, true);
477 }
478
479 return STDIN_FILENO;
480 }
481
482 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
483
484 switch (i) {
485
486 case EXEC_INPUT_NULL:
487 return open_null_as(O_RDONLY, STDIN_FILENO);
488
489 case EXEC_INPUT_TTY:
490 case EXEC_INPUT_TTY_FORCE:
491 case EXEC_INPUT_TTY_FAIL: {
492 int fd;
493
494 fd = acquire_terminal(exec_context_tty_path(context),
495 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
496 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
497 ACQUIRE_TERMINAL_WAIT,
498 USEC_INFINITY);
499 if (fd < 0)
500 return fd;
501
502 return move_fd(fd, STDIN_FILENO, false);
503 }
504
505 case EXEC_INPUT_SOCKET:
506 assert(socket_fd >= 0);
507
508 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
509
510 case EXEC_INPUT_NAMED_FD:
511 assert(named_iofds[STDIN_FILENO] >= 0);
512
513 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
514 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
515
516 case EXEC_INPUT_DATA: {
517 int fd;
518
519 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
520 if (fd < 0)
521 return fd;
522
523 return move_fd(fd, STDIN_FILENO, false);
524 }
525
526 case EXEC_INPUT_FILE: {
527 bool rw;
528 int fd;
529
530 assert(context->stdio_file[STDIN_FILENO]);
531
532 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
533 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
534
535 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
536 if (fd < 0)
537 return fd;
538
539 return move_fd(fd, STDIN_FILENO, false);
540 }
541
542 default:
543 assert_not_reached("Unknown input type");
544 }
545 }
546
547 static bool can_inherit_stderr_from_stdout(
548 const ExecContext *context,
549 ExecOutput o,
550 ExecOutput e) {
551
552 assert(context);
553
554 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
555 * stderr fd */
556
557 if (e == EXEC_OUTPUT_INHERIT)
558 return true;
559 if (e != o)
560 return false;
561
562 if (e == EXEC_OUTPUT_NAMED_FD)
563 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
564
565 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
566 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
567
568 return true;
569 }
570
571 static int setup_output(
572 const Unit *unit,
573 const ExecContext *context,
574 const ExecParameters *params,
575 int fileno,
576 int socket_fd,
577 int named_iofds[3],
578 const char *ident,
579 uid_t uid,
580 gid_t gid,
581 dev_t *journal_stream_dev,
582 ino_t *journal_stream_ino) {
583
584 ExecOutput o;
585 ExecInput i;
586 int r;
587
588 assert(unit);
589 assert(context);
590 assert(params);
591 assert(ident);
592 assert(journal_stream_dev);
593 assert(journal_stream_ino);
594
595 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
596
597 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
598 return -errno;
599
600 return STDOUT_FILENO;
601 }
602
603 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
604 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
605 return -errno;
606
607 return STDERR_FILENO;
608 }
609
610 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
611 o = fixup_output(context->std_output, socket_fd);
612
613 if (fileno == STDERR_FILENO) {
614 ExecOutput e;
615 e = fixup_output(context->std_error, socket_fd);
616
617 /* This expects the input and output are already set up */
618
619 /* Don't change the stderr file descriptor if we inherit all
620 * the way and are not on a tty */
621 if (e == EXEC_OUTPUT_INHERIT &&
622 o == EXEC_OUTPUT_INHERIT &&
623 i == EXEC_INPUT_NULL &&
624 !is_terminal_input(context->std_input) &&
625 getppid () != 1)
626 return fileno;
627
628 /* Duplicate from stdout if possible */
629 if (can_inherit_stderr_from_stdout(context, o, e))
630 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
631
632 o = e;
633
634 } else if (o == EXEC_OUTPUT_INHERIT) {
635 /* If input got downgraded, inherit the original value */
636 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
637 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
638
639 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
640 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
641 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
642
643 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
644 if (getppid() != 1)
645 return fileno;
646
647 /* We need to open /dev/null here anew, to get the right access mode. */
648 return open_null_as(O_WRONLY, fileno);
649 }
650
651 switch (o) {
652
653 case EXEC_OUTPUT_NULL:
654 return open_null_as(O_WRONLY, fileno);
655
656 case EXEC_OUTPUT_TTY:
657 if (is_terminal_input(i))
658 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
659
660 /* We don't reset the terminal if this is just about output */
661 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
662
663 case EXEC_OUTPUT_SYSLOG:
664 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
665 case EXEC_OUTPUT_KMSG:
666 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
667 case EXEC_OUTPUT_JOURNAL:
668 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
669 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
670 if (r < 0) {
671 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
672 r = open_null_as(O_WRONLY, fileno);
673 } else {
674 struct stat st;
675
676 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
677 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
678 * services to detect whether they are connected to the journal or not.
679 *
680 * If both stdout and stderr are connected to a stream then let's make sure to store the data
681 * about STDERR as that's usually the best way to do logging. */
682
683 if (fstat(fileno, &st) >= 0 &&
684 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
685 *journal_stream_dev = st.st_dev;
686 *journal_stream_ino = st.st_ino;
687 }
688 }
689 return r;
690
691 case EXEC_OUTPUT_SOCKET:
692 assert(socket_fd >= 0);
693
694 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
695
696 case EXEC_OUTPUT_NAMED_FD:
697 assert(named_iofds[fileno] >= 0);
698
699 (void) fd_nonblock(named_iofds[fileno], false);
700 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
701
702 case EXEC_OUTPUT_FILE:
703 case EXEC_OUTPUT_FILE_APPEND: {
704 bool rw;
705 int fd, flags;
706
707 assert(context->stdio_file[fileno]);
708
709 rw = context->std_input == EXEC_INPUT_FILE &&
710 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
711
712 if (rw)
713 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
714
715 flags = O_WRONLY;
716 if (o == EXEC_OUTPUT_FILE_APPEND)
717 flags |= O_APPEND;
718
719 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
720 if (fd < 0)
721 return fd;
722
723 return move_fd(fd, fileno, 0);
724 }
725
726 default:
727 assert_not_reached("Unknown error type");
728 }
729 }
730
731 static int chown_terminal(int fd, uid_t uid) {
732 struct stat st;
733
734 assert(fd >= 0);
735
736 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
737 if (isatty(fd) < 1)
738 return 0;
739
740 /* This might fail. What matters are the results. */
741 (void) fchown(fd, uid, -1);
742 (void) fchmod(fd, TTY_MODE);
743
744 if (fstat(fd, &st) < 0)
745 return -errno;
746
747 if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
748 return -EPERM;
749
750 return 0;
751 }
752
753 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
754 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
755 int r;
756
757 assert(_saved_stdin);
758 assert(_saved_stdout);
759
760 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
761 if (saved_stdin < 0)
762 return -errno;
763
764 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
765 if (saved_stdout < 0)
766 return -errno;
767
768 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
769 if (fd < 0)
770 return fd;
771
772 r = chown_terminal(fd, getuid());
773 if (r < 0)
774 return r;
775
776 r = reset_terminal_fd(fd, true);
777 if (r < 0)
778 return r;
779
780 r = rearrange_stdio(fd, fd, STDERR_FILENO);
781 fd = -1;
782 if (r < 0)
783 return r;
784
785 *_saved_stdin = saved_stdin;
786 *_saved_stdout = saved_stdout;
787
788 saved_stdin = saved_stdout = -1;
789
790 return 0;
791 }
792
793 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
794 assert(err < 0);
795
796 if (err == -ETIMEDOUT)
797 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
798 else {
799 errno = -err;
800 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
801 }
802 }
803
804 static void write_confirm_error(int err, const char *vc, const Unit *u) {
805 _cleanup_close_ int fd = -1;
806
807 assert(vc);
808
809 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
810 if (fd < 0)
811 return;
812
813 write_confirm_error_fd(err, fd, u);
814 }
815
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved stdin/stdout fds back into place
 * (where valid) and closes the saved copies. Returns 0 on success, or the negative errno-style error of
 * the last dup2() that failed. The saved fds are closed and invalidated in either case. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
837
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* Don't execute the command, and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* Don't execute the command, and pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* Go ahead and execute the command */
};
843
844 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
845 int saved_stdout = -1, saved_stdin = -1, r;
846 _cleanup_free_ char *e = NULL;
847 char c;
848
849 /* For any internal errors, assume a positive response. */
850 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
851 if (r < 0) {
852 write_confirm_error(r, vc, u);
853 return CONFIRM_EXECUTE;
854 }
855
856 /* confirm_spawn might have been disabled while we were sleeping. */
857 if (manager_is_confirm_spawn_disabled(u->manager)) {
858 r = 1;
859 goto restore_stdio;
860 }
861
862 e = ellipsize(cmdline, 60, 100);
863 if (!e) {
864 log_oom();
865 r = CONFIRM_EXECUTE;
866 goto restore_stdio;
867 }
868
869 for (;;) {
870 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
871 if (r < 0) {
872 write_confirm_error_fd(r, STDOUT_FILENO, u);
873 r = CONFIRM_EXECUTE;
874 goto restore_stdio;
875 }
876
877 switch (c) {
878 case 'c':
879 printf("Resuming normal execution.\n");
880 manager_disable_confirm_spawn();
881 r = 1;
882 break;
883 case 'D':
884 unit_dump(u, stdout, " ");
885 continue; /* ask again */
886 case 'f':
887 printf("Failing execution.\n");
888 r = CONFIRM_PRETEND_FAILURE;
889 break;
890 case 'h':
891 printf(" c - continue, proceed without asking anymore\n"
892 " D - dump, show the state of the unit\n"
893 " f - fail, don't execute the command and pretend it failed\n"
894 " h - help\n"
895 " i - info, show a short summary of the unit\n"
896 " j - jobs, show jobs that are in progress\n"
897 " s - skip, don't execute the command and pretend it succeeded\n"
898 " y - yes, execute the command\n");
899 continue; /* ask again */
900 case 'i':
901 printf(" Description: %s\n"
902 " Unit: %s\n"
903 " Command: %s\n",
904 u->id, u->description, cmdline);
905 continue; /* ask again */
906 case 'j':
907 manager_dump_jobs(u->manager, stdout, " ");
908 continue; /* ask again */
909 case 'n':
910 /* 'n' was removed in favor of 'f'. */
911 printf("Didn't understand 'n', did you mean 'f'?\n");
912 continue; /* ask again */
913 case 's':
914 printf("Skipping execution.\n");
915 r = CONFIRM_PRETEND_SUCCESS;
916 break;
917 case 'y':
918 r = CONFIRM_EXECUTE;
919 break;
920 default:
921 assert_not_reached("Unhandled choice");
922 }
923 break;
924 }
925
926 restore_stdio:
927 restore_confirm_stdio(&saved_stdin, &saved_stdout);
928 return r;
929 }
930
931 static int get_fixed_user(const ExecContext *c, const char **user,
932 uid_t *uid, gid_t *gid,
933 const char **home, const char **shell) {
934 int r;
935 const char *name;
936
937 assert(c);
938
939 if (!c->user)
940 return 0;
941
942 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
943 * (i.e. are "/" or "/bin/nologin"). */
944
945 name = c->user;
946 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
947 if (r < 0)
948 return r;
949
950 *user = name;
951 return 0;
952 }
953
954 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
955 int r;
956 const char *name;
957
958 assert(c);
959
960 if (!c->group)
961 return 0;
962
963 name = c->group;
964 r = get_group_creds(&name, gid, 0);
965 if (r < 0)
966 return r;
967
968 *group = name;
969 return 0;
970 }
971
972 static int get_supplementary_groups(const ExecContext *c, const char *user,
973 const char *group, gid_t gid,
974 gid_t **supplementary_gids, int *ngids) {
975 char **i;
976 int r, k = 0;
977 int ngroups_max;
978 bool keep_groups = false;
979 gid_t *groups = NULL;
980 _cleanup_free_ gid_t *l_gids = NULL;
981
982 assert(c);
983
984 /*
985 * If user is given, then lookup GID and supplementary groups list.
986 * We avoid NSS lookups for gid=0. Also we have to initialize groups
987 * here and as early as possible so we keep the list of supplementary
988 * groups of the caller.
989 */
990 if (user && gid_is_valid(gid) && gid != 0) {
991 /* First step, initialize groups from /etc/groups */
992 if (initgroups(user, gid) < 0)
993 return -errno;
994
995 keep_groups = true;
996 }
997
998 if (strv_isempty(c->supplementary_groups))
999 return 0;
1000
1001 /*
1002 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1003 * be positive, otherwise fail.
1004 */
1005 errno = 0;
1006 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1007 if (ngroups_max <= 0) {
1008 if (errno > 0)
1009 return -errno;
1010 else
1011 return -EOPNOTSUPP; /* For all other values */
1012 }
1013
1014 l_gids = new(gid_t, ngroups_max);
1015 if (!l_gids)
1016 return -ENOMEM;
1017
1018 if (keep_groups) {
1019 /*
1020 * Lookup the list of groups that the user belongs to, we
1021 * avoid NSS lookups here too for gid=0.
1022 */
1023 k = ngroups_max;
1024 if (getgrouplist(user, gid, l_gids, &k) < 0)
1025 return -EINVAL;
1026 } else
1027 k = 0;
1028
1029 STRV_FOREACH(i, c->supplementary_groups) {
1030 const char *g;
1031
1032 if (k >= ngroups_max)
1033 return -E2BIG;
1034
1035 g = *i;
1036 r = get_group_creds(&g, l_gids+k, 0);
1037 if (r < 0)
1038 return r;
1039
1040 k++;
1041 }
1042
1043 /*
1044 * Sets ngids to zero to drop all supplementary groups, happens
1045 * when we are under root and SupplementaryGroups= is empty.
1046 */
1047 if (k == 0) {
1048 *ngids = 0;
1049 return 0;
1050 }
1051
1052 /* Otherwise get the final list of supplementary groups */
1053 groups = memdup(l_gids, sizeof(gid_t) * k);
1054 if (!groups)
1055 return -ENOMEM;
1056
1057 *supplementary_gids = groups;
1058 *ngids = k;
1059
1060 groups = NULL;
1061
1062 return 0;
1063 }
1064
/* Applies the supplementary group list (if it is non-empty) and then switches the real, effective and
 * saved GID to 'gid' (if it is valid). Returns 0 on success, or a negative errno-style error. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        int r;

        /* Supplementary groups first, but only if there are any */
        if (ngids > 0) {
                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then the primary group */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1083
1084 static int enforce_user(const ExecContext *context, uid_t uid) {
1085 assert(context);
1086
1087 if (!uid_is_valid(uid))
1088 return 0;
1089
1090 /* Sets (but doesn't look up) the uid and make sure we keep the
1091 * capabilities while doing so. */
1092
1093 if (context->capability_ambient_set != 0) {
1094
1095 /* First step: If we need to keep capabilities but
1096 * drop privileges we need to make sure we keep our
1097 * caps, while we drop privileges. */
1098 if (uid != 0) {
1099 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1100
1101 if (prctl(PR_GET_SECUREBITS) != sb)
1102 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1103 return -errno;
1104 }
1105 }
1106
1107 /* Second step: actually set the uids */
1108 if (setresuid(uid, uid, uid) < 0)
1109 return -errno;
1110
1111 /* At this point we should have all necessary capabilities but
1112 are otherwise a normal user. However, the caps might got
1113 corrupted due to the setresuid() so we need clean them up
1114 later. This is done outside of this call. */
1115
1116 return 0;
1117 }
1118
1119 #if HAVE_PAM
1120
1121 static int null_conv(
1122 int num_msg,
1123 const struct pam_message **msg,
1124 struct pam_response **resp,
1125 void *appdata_ptr) {
1126
1127 /* We don't support conversations */
1128
1129 return PAM_CONV_ERR;
1130 }
1131
1132 #endif
1133
1134 static int setup_pam(
1135 const char *name,
1136 const char *user,
1137 uid_t uid,
1138 gid_t gid,
1139 const char *tty,
1140 char ***env,
1141 int fds[], size_t n_fds) {
1142
1143 #if HAVE_PAM
1144
1145 static const struct pam_conv conv = {
1146 .conv = null_conv,
1147 .appdata_ptr = NULL
1148 };
1149
1150 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1151 pam_handle_t *handle = NULL;
1152 sigset_t old_ss;
1153 int pam_code = PAM_SUCCESS, r;
1154 char **nv, **e = NULL;
1155 bool close_session = false;
1156 pid_t pam_pid = 0, parent_pid;
1157 int flags = 0;
1158
1159 assert(name);
1160 assert(user);
1161 assert(env);
1162
1163 /* We set up PAM in the parent process, then fork. The child
1164 * will then stay around until killed via PR_GET_PDEATHSIG or
1165 * systemd via the cgroup logic. It will then remove the PAM
1166 * session again. The parent process will exec() the actual
1167 * daemon. We do things this way to ensure that the main PID
1168 * of the daemon is the one we initially fork()ed. */
1169
1170 r = barrier_create(&barrier);
1171 if (r < 0)
1172 goto fail;
1173
1174 if (log_get_max_level() < LOG_DEBUG)
1175 flags |= PAM_SILENT;
1176
1177 pam_code = pam_start(name, user, &conv, &handle);
1178 if (pam_code != PAM_SUCCESS) {
1179 handle = NULL;
1180 goto fail;
1181 }
1182
1183 if (!tty) {
1184 _cleanup_free_ char *q = NULL;
1185
1186 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1187 * out if that's the case, and read the TTY off it. */
1188
1189 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1190 tty = strjoina("/dev/", q);
1191 }
1192
1193 if (tty) {
1194 pam_code = pam_set_item(handle, PAM_TTY, tty);
1195 if (pam_code != PAM_SUCCESS)
1196 goto fail;
1197 }
1198
1199 STRV_FOREACH(nv, *env) {
1200 pam_code = pam_putenv(handle, *nv);
1201 if (pam_code != PAM_SUCCESS)
1202 goto fail;
1203 }
1204
1205 pam_code = pam_acct_mgmt(handle, flags);
1206 if (pam_code != PAM_SUCCESS)
1207 goto fail;
1208
1209 pam_code = pam_open_session(handle, flags);
1210 if (pam_code != PAM_SUCCESS)
1211 goto fail;
1212
1213 close_session = true;
1214
1215 e = pam_getenvlist(handle);
1216 if (!e) {
1217 pam_code = PAM_BUF_ERR;
1218 goto fail;
1219 }
1220
1221 /* Block SIGTERM, so that we know that it won't get lost in
1222 * the child */
1223
1224 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1225
1226 parent_pid = getpid_cached();
1227
1228 r = safe_fork("(sd-pam)", 0, &pam_pid);
1229 if (r < 0)
1230 goto fail;
1231 if (r == 0) {
1232 int sig, ret = EXIT_PAM;
1233
1234 /* The child's job is to reset the PAM session on
1235 * termination */
1236 barrier_set_role(&barrier, BARRIER_CHILD);
1237
1238 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
1239 * are open here that have been opened by PAM. */
1240 (void) close_many(fds, n_fds);
1241
1242 /* Drop privileges - we don't need any to pam_close_session
1243 * and this will make PR_SET_PDEATHSIG work in most cases.
1244 * If this fails, ignore the error - but expect sd-pam threads
1245 * to fail to exit normally */
1246
1247 r = maybe_setgroups(0, NULL);
1248 if (r < 0)
1249 log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
1250 if (setresgid(gid, gid, gid) < 0)
1251 log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
1252 if (setresuid(uid, uid, uid) < 0)
1253 log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
1254
1255 (void) ignore_signals(SIGPIPE, -1);
1256
1257 /* Wait until our parent died. This will only work if
1258 * the above setresuid() succeeds, otherwise the kernel
1259 * will not allow unprivileged parents kill their privileged
1260 * children this way. We rely on the control groups kill logic
1261 * to do the rest for us. */
1262 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1263 goto child_finish;
1264
1265 /* Tell the parent that our setup is done. This is especially
1266 * important regarding dropping privileges. Otherwise, unit
1267 * setup might race against our setresuid(2) call.
1268 *
1269 * If the parent aborted, we'll detect this below, hence ignore
1270 * return failure here. */
1271 (void) barrier_place(&barrier);
1272
1273 /* Check if our parent process might already have died? */
1274 if (getppid() == parent_pid) {
1275 sigset_t ss;
1276
1277 assert_se(sigemptyset(&ss) >= 0);
1278 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1279
1280 for (;;) {
1281 if (sigwait(&ss, &sig) < 0) {
1282 if (errno == EINTR)
1283 continue;
1284
1285 goto child_finish;
1286 }
1287
1288 assert(sig == SIGTERM);
1289 break;
1290 }
1291 }
1292
1293 /* If our parent died we'll end the session */
1294 if (getppid() != parent_pid) {
1295 pam_code = pam_close_session(handle, flags);
1296 if (pam_code != PAM_SUCCESS)
1297 goto child_finish;
1298 }
1299
1300 ret = 0;
1301
1302 child_finish:
1303 pam_end(handle, pam_code | flags);
1304 _exit(ret);
1305 }
1306
1307 barrier_set_role(&barrier, BARRIER_PARENT);
1308
1309 /* If the child was forked off successfully it will do all the
1310 * cleanups, so forget about the handle here. */
1311 handle = NULL;
1312
1313 /* Unblock SIGTERM again in the parent */
1314 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1315
1316 /* We close the log explicitly here, since the PAM modules
1317 * might have opened it, but we don't want this fd around. */
1318 closelog();
1319
1320 /* Synchronously wait for the child to initialize. We don't care for
1321 * errors as we cannot recover. However, warn loudly if it happens. */
1322 if (!barrier_place_and_sync(&barrier))
1323 log_error("PAM initialization failed");
1324
1325 return strv_free_and_replace(*env, e);
1326
1327 fail:
1328 if (pam_code != PAM_SUCCESS) {
1329 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1330 r = -EPERM; /* PAM errors do not map to errno */
1331 } else
1332 log_error_errno(r, "PAM failed: %m");
1333
1334 if (handle) {
1335 if (close_session)
1336 pam_code = pam_close_session(handle, flags);
1337
1338 pam_end(handle, pam_code | flags);
1339 }
1340
1341 strv_free(e);
1342 closelog();
1343
1344 return r;
1345 #else
1346 return 0;
1347 #endif
1348 }
1349
/* Derives a short process title from the last path component of 'path' and applies it via
 * rename_process(). The title has the form "(name)"; at most the final 8 characters of the
 * component are kept, so that the result never exceeds 10 characters (i.e. the length of
 * "/sbin/init") and looks pretty in /bin/ps. If the path has no usable last component the
 * placeholder "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char title[11]; /* '(' + up to 8 characters + ')' + NUL */
        const char *name;
        char *w = title;
        size_t len;

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        len = strlen(name);
        if (len > 8) {
                /* Keep the tail: the end of the name is usually more interesting, since the
                 * beginning might just be "systemd-". */
                name += len - 8;
                len = 8;
        }

        *w++ = '(';
        memcpy(w, name, len);
        w += len;
        *w++ = ')';
        *w = 0;

        rename_process(title);
}
1380
1381 static bool context_has_address_families(const ExecContext *c) {
1382 assert(c);
1383
1384 return c->address_families_whitelist ||
1385 !set_isempty(c->address_families);
1386 }
1387
1388 static bool context_has_syscall_filters(const ExecContext *c) {
1389 assert(c);
1390
1391 return c->syscall_whitelist ||
1392 !hashmap_isempty(c->syscall_filter);
1393 }
1394
1395 static bool context_has_no_new_privileges(const ExecContext *c) {
1396 assert(c);
1397
1398 if (c->no_new_privileges)
1399 return true;
1400
1401 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1402 return false;
1403
1404 /* We need NNP if we have any form of seccomp and are unprivileged */
1405 return context_has_address_families(c) ||
1406 c->memory_deny_write_execute ||
1407 c->restrict_realtime ||
1408 exec_context_restrict_namespaces_set(c) ||
1409 c->protect_kernel_tunables ||
1410 c->protect_kernel_modules ||
1411 c->private_devices ||
1412 context_has_syscall_filters(c) ||
1413 !set_isempty(c->syscall_archs) ||
1414 c->lock_personality;
1415 }
1416
1417 #if HAVE_SECCOMP
1418
1419 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1420
1421 if (is_seccomp_available())
1422 return false;
1423
1424 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1425 return true;
1426 }
1427
1428 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1429 uint32_t negative_action, default_action, action;
1430 int r;
1431
1432 assert(u);
1433 assert(c);
1434
1435 if (!context_has_syscall_filters(c))
1436 return 0;
1437
1438 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1439 return 0;
1440
1441 negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1442
1443 if (c->syscall_whitelist) {
1444 default_action = negative_action;
1445 action = SCMP_ACT_ALLOW;
1446 } else {
1447 default_action = SCMP_ACT_ALLOW;
1448 action = negative_action;
1449 }
1450
1451 if (needs_ambient_hack) {
1452 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1453 if (r < 0)
1454 return r;
1455 }
1456
1457 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1458 }
1459
1460 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1461 assert(u);
1462 assert(c);
1463
1464 if (set_isempty(c->syscall_archs))
1465 return 0;
1466
1467 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1468 return 0;
1469
1470 return seccomp_restrict_archs(c->syscall_archs);
1471 }
1472
1473 static int apply_address_families(const Unit* u, const ExecContext *c) {
1474 assert(u);
1475 assert(c);
1476
1477 if (!context_has_address_families(c))
1478 return 0;
1479
1480 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1481 return 0;
1482
1483 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1484 }
1485
1486 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1487 assert(u);
1488 assert(c);
1489
1490 if (!c->memory_deny_write_execute)
1491 return 0;
1492
1493 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1494 return 0;
1495
1496 return seccomp_memory_deny_write_execute();
1497 }
1498
1499 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1500 assert(u);
1501 assert(c);
1502
1503 if (!c->restrict_realtime)
1504 return 0;
1505
1506 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1507 return 0;
1508
1509 return seccomp_restrict_realtime();
1510 }
1511
1512 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1513 assert(u);
1514 assert(c);
1515
1516 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1517 * let's protect even those systems where this is left on in the kernel. */
1518
1519 if (!c->protect_kernel_tunables)
1520 return 0;
1521
1522 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1523 return 0;
1524
1525 return seccomp_protect_sysctl();
1526 }
1527
1528 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1529 assert(u);
1530 assert(c);
1531
1532 /* Turn off module syscalls on ProtectKernelModules=yes */
1533
1534 if (!c->protect_kernel_modules)
1535 return 0;
1536
1537 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1538 return 0;
1539
1540 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1541 }
1542
1543 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1544 assert(u);
1545 assert(c);
1546
1547 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1548
1549 if (!c->private_devices)
1550 return 0;
1551
1552 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1553 return 0;
1554
1555 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1556 }
1557
1558 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1559 assert(u);
1560 assert(c);
1561
1562 if (!exec_context_restrict_namespaces_set(c))
1563 return 0;
1564
1565 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1566 return 0;
1567
1568 return seccomp_restrict_namespaces(c->restrict_namespaces);
1569 }
1570
1571 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1572 unsigned long personality;
1573 int r;
1574
1575 assert(u);
1576 assert(c);
1577
1578 if (!c->lock_personality)
1579 return 0;
1580
1581 if (skip_seccomp_unavailable(u, "LockPersonality="))
1582 return 0;
1583
1584 personality = c->personality;
1585
1586 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1587 if (personality == PERSONALITY_INVALID) {
1588
1589 r = opinionated_personality(&personality);
1590 if (r < 0)
1591 return r;
1592 }
1593
1594 return seccomp_lock_personality(personality);
1595 }
1596
1597 #endif
1598
1599 static void do_idle_pipe_dance(int idle_pipe[4]) {
1600 assert(idle_pipe);
1601
1602 idle_pipe[1] = safe_close(idle_pipe[1]);
1603 idle_pipe[2] = safe_close(idle_pipe[2]);
1604
1605 if (idle_pipe[0] >= 0) {
1606 int r;
1607
1608 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1609
1610 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1611 ssize_t n;
1612
1613 /* Signal systemd that we are bored and want to continue. */
1614 n = write(idle_pipe[3], "x", 1);
1615 if (n > 0)
1616 /* Wait for systemd to react to the signal above. */
1617 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1618 }
1619
1620 idle_pipe[0] = safe_close(idle_pipe[0]);
1621
1622 }
1623
1624 idle_pipe[3] = safe_close(idle_pipe[3]);
1625 }
1626
1627 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1628
1629 static int build_environment(
1630 const Unit *u,
1631 const ExecContext *c,
1632 const ExecParameters *p,
1633 size_t n_fds,
1634 const char *home,
1635 const char *username,
1636 const char *shell,
1637 dev_t journal_stream_dev,
1638 ino_t journal_stream_ino,
1639 char ***ret) {
1640
1641 _cleanup_strv_free_ char **our_env = NULL;
1642 ExecDirectoryType t;
1643 size_t n_env = 0;
1644 char *x;
1645
1646 assert(u);
1647 assert(c);
1648 assert(p);
1649 assert(ret);
1650
1651 our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX);
1652 if (!our_env)
1653 return -ENOMEM;
1654
1655 if (n_fds > 0) {
1656 _cleanup_free_ char *joined = NULL;
1657
1658 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1659 return -ENOMEM;
1660 our_env[n_env++] = x;
1661
1662 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1663 return -ENOMEM;
1664 our_env[n_env++] = x;
1665
1666 joined = strv_join(p->fd_names, ":");
1667 if (!joined)
1668 return -ENOMEM;
1669
1670 x = strjoin("LISTEN_FDNAMES=", joined);
1671 if (!x)
1672 return -ENOMEM;
1673 our_env[n_env++] = x;
1674 }
1675
1676 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1677 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1678 return -ENOMEM;
1679 our_env[n_env++] = x;
1680
1681 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1682 return -ENOMEM;
1683 our_env[n_env++] = x;
1684 }
1685
1686 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1687 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1688 * check the database directly. */
1689 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1690 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1691 if (!x)
1692 return -ENOMEM;
1693 our_env[n_env++] = x;
1694 }
1695
1696 if (home) {
1697 x = strappend("HOME=", home);
1698 if (!x)
1699 return -ENOMEM;
1700 our_env[n_env++] = x;
1701 }
1702
1703 if (username) {
1704 x = strappend("LOGNAME=", username);
1705 if (!x)
1706 return -ENOMEM;
1707 our_env[n_env++] = x;
1708
1709 x = strappend("USER=", username);
1710 if (!x)
1711 return -ENOMEM;
1712 our_env[n_env++] = x;
1713 }
1714
1715 if (shell) {
1716 x = strappend("SHELL=", shell);
1717 if (!x)
1718 return -ENOMEM;
1719 our_env[n_env++] = x;
1720 }
1721
1722 if (!sd_id128_is_null(u->invocation_id)) {
1723 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1724 return -ENOMEM;
1725
1726 our_env[n_env++] = x;
1727 }
1728
1729 if (exec_context_needs_term(c)) {
1730 const char *tty_path, *term = NULL;
1731
1732 tty_path = exec_context_tty_path(c);
1733
1734 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1735 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1736 * passes to PID 1 ends up all the way in the console login shown. */
1737
1738 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1739 term = getenv("TERM");
1740 if (!term)
1741 term = default_term_for_tty(tty_path);
1742
1743 x = strappend("TERM=", term);
1744 if (!x)
1745 return -ENOMEM;
1746 our_env[n_env++] = x;
1747 }
1748
1749 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1750 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1751 return -ENOMEM;
1752
1753 our_env[n_env++] = x;
1754 }
1755
1756 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1757 _cleanup_free_ char *pre = NULL, *joined = NULL;
1758 const char *n;
1759
1760 if (!p->prefix[t])
1761 continue;
1762
1763 if (strv_isempty(c->directories[t].paths))
1764 continue;
1765
1766 n = exec_directory_env_name_to_string(t);
1767 if (!n)
1768 continue;
1769
1770 pre = strjoin(p->prefix[t], "/");
1771 if (!pre)
1772 return -ENOMEM;
1773
1774 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1775 if (!joined)
1776 return -ENOMEM;
1777
1778 x = strjoin(n, "=", joined);
1779 if (!x)
1780 return -ENOMEM;
1781
1782 our_env[n_env++] = x;
1783 }
1784
1785 our_env[n_env++] = NULL;
1786 assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1787
1788 *ret = TAKE_PTR(our_env);
1789
1790 return 0;
1791 }
1792
1793 static int build_pass_environment(const ExecContext *c, char ***ret) {
1794 _cleanup_strv_free_ char **pass_env = NULL;
1795 size_t n_env = 0, n_bufsize = 0;
1796 char **i;
1797
1798 STRV_FOREACH(i, c->pass_environment) {
1799 _cleanup_free_ char *x = NULL;
1800 char *v;
1801
1802 v = getenv(*i);
1803 if (!v)
1804 continue;
1805 x = strjoin(*i, "=", v);
1806 if (!x)
1807 return -ENOMEM;
1808
1809 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1810 return -ENOMEM;
1811
1812 pass_env[n_env++] = TAKE_PTR(x);
1813 pass_env[n_env] = NULL;
1814 }
1815
1816 *ret = TAKE_PTR(pass_env);
1817
1818 return 0;
1819 }
1820
1821 static bool exec_needs_mount_namespace(
1822 const ExecContext *context,
1823 const ExecParameters *params,
1824 const ExecRuntime *runtime) {
1825
1826 assert(context);
1827 assert(params);
1828
1829 if (context->root_image)
1830 return true;
1831
1832 if (!strv_isempty(context->read_write_paths) ||
1833 !strv_isempty(context->read_only_paths) ||
1834 !strv_isempty(context->inaccessible_paths))
1835 return true;
1836
1837 if (context->n_bind_mounts > 0)
1838 return true;
1839
1840 if (context->n_temporary_filesystems > 0)
1841 return true;
1842
1843 if (context->mount_flags != 0)
1844 return true;
1845
1846 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1847 return true;
1848
1849 if (context->private_devices ||
1850 context->private_mounts ||
1851 context->protect_system != PROTECT_SYSTEM_NO ||
1852 context->protect_home != PROTECT_HOME_NO ||
1853 context->protect_kernel_tunables ||
1854 context->protect_kernel_modules ||
1855 context->protect_control_groups)
1856 return true;
1857
1858 if (context->root_directory) {
1859 ExecDirectoryType t;
1860
1861 if (context->mount_apivfs)
1862 return true;
1863
1864 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1865 if (!params->prefix[t])
1866 continue;
1867
1868 if (!strv_isempty(context->directories[t].paths))
1869 return true;
1870 }
1871 }
1872
1873 if (context->dynamic_user &&
1874 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1875 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1876 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1877 return true;
1878
1879 return false;
1880 }
1881
1882 static int setup_private_users(uid_t uid, gid_t gid) {
1883 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1884 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1885 _cleanup_close_ int unshare_ready_fd = -1;
1886 _cleanup_(sigkill_waitp) pid_t pid = 0;
1887 uint64_t c = 1;
1888 ssize_t n;
1889 int r;
1890
1891 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1892 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1893 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1894 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1895 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1896 * continues execution normally. */
1897
1898 if (uid != 0 && uid_is_valid(uid)) {
1899 r = asprintf(&uid_map,
1900 "0 0 1\n" /* Map root → root */
1901 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
1902 uid, uid);
1903 if (r < 0)
1904 return -ENOMEM;
1905 } else {
1906 uid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1907 if (!uid_map)
1908 return -ENOMEM;
1909 }
1910
1911 if (gid != 0 && gid_is_valid(gid)) {
1912 r = asprintf(&gid_map,
1913 "0 0 1\n" /* Map root → root */
1914 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
1915 gid, gid);
1916 if (r < 0)
1917 return -ENOMEM;
1918 } else {
1919 gid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1920 if (!gid_map)
1921 return -ENOMEM;
1922 }
1923
1924 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1925 * namespace. */
1926 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1927 if (unshare_ready_fd < 0)
1928 return -errno;
1929
1930 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1931 * failed. */
1932 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1933 return -errno;
1934
1935 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1936 if (r < 0)
1937 return r;
1938 if (r == 0) {
1939 _cleanup_close_ int fd = -1;
1940 const char *a;
1941 pid_t ppid;
1942
1943 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1944 * here, after the parent opened its own user namespace. */
1945
1946 ppid = getppid();
1947 errno_pipe[0] = safe_close(errno_pipe[0]);
1948
1949 /* Wait until the parent unshared the user namespace */
1950 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1951 r = -errno;
1952 goto child_fail;
1953 }
1954
1955 /* Disable the setgroups() system call in the child user namespace, for good. */
1956 a = procfs_file_alloca(ppid, "setgroups");
1957 fd = open(a, O_WRONLY|O_CLOEXEC);
1958 if (fd < 0) {
1959 if (errno != ENOENT) {
1960 r = -errno;
1961 goto child_fail;
1962 }
1963
1964 /* If the file is missing the kernel is too old, let's continue anyway. */
1965 } else {
1966 if (write(fd, "deny\n", 5) < 0) {
1967 r = -errno;
1968 goto child_fail;
1969 }
1970
1971 fd = safe_close(fd);
1972 }
1973
1974 /* First write the GID map */
1975 a = procfs_file_alloca(ppid, "gid_map");
1976 fd = open(a, O_WRONLY|O_CLOEXEC);
1977 if (fd < 0) {
1978 r = -errno;
1979 goto child_fail;
1980 }
1981 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1982 r = -errno;
1983 goto child_fail;
1984 }
1985 fd = safe_close(fd);
1986
1987 /* The write the UID map */
1988 a = procfs_file_alloca(ppid, "uid_map");
1989 fd = open(a, O_WRONLY|O_CLOEXEC);
1990 if (fd < 0) {
1991 r = -errno;
1992 goto child_fail;
1993 }
1994 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1995 r = -errno;
1996 goto child_fail;
1997 }
1998
1999 _exit(EXIT_SUCCESS);
2000
2001 child_fail:
2002 (void) write(errno_pipe[1], &r, sizeof(r));
2003 _exit(EXIT_FAILURE);
2004 }
2005
2006 errno_pipe[1] = safe_close(errno_pipe[1]);
2007
2008 if (unshare(CLONE_NEWUSER) < 0)
2009 return -errno;
2010
2011 /* Let the child know that the namespace is ready now */
2012 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2013 return -errno;
2014
2015 /* Try to read an error code from the child */
2016 n = read(errno_pipe[0], &r, sizeof(r));
2017 if (n < 0)
2018 return -errno;
2019 if (n == sizeof(r)) { /* an error code was sent to us */
2020 if (r < 0)
2021 return r;
2022 return -EIO;
2023 }
2024 if (n != 0) /* on success we should have read 0 bytes */
2025 return -EIO;
2026
2027 r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2028 pid = 0;
2029 if (r < 0)
2030 return r;
2031 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2032 return -EIO;
2033
2034 return 0;
2035 }
2036
2037 static int setup_exec_directory(
2038 const ExecContext *context,
2039 const ExecParameters *params,
2040 uid_t uid,
2041 gid_t gid,
2042 ExecDirectoryType type,
2043 int *exit_status) {
2044
2045 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2046 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2047 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2048 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2049 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2050 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2051 };
2052 char **rt;
2053 int r;
2054
2055 assert(context);
2056 assert(params);
2057 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2058 assert(exit_status);
2059
2060 if (!params->prefix[type])
2061 return 0;
2062
2063 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2064 if (!uid_is_valid(uid))
2065 uid = 0;
2066 if (!gid_is_valid(gid))
2067 gid = 0;
2068 }
2069
2070 STRV_FOREACH(rt, context->directories[type].paths) {
2071 _cleanup_free_ char *p = NULL, *pp = NULL;
2072
2073 p = strjoin(params->prefix[type], "/", *rt);
2074 if (!p) {
2075 r = -ENOMEM;
2076 goto fail;
2077 }
2078
2079 r = mkdir_parents_label(p, 0755);
2080 if (r < 0)
2081 goto fail;
2082
2083 if (context->dynamic_user &&
2084 !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2085 _cleanup_free_ char *private_root = NULL;
2086
2087 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2088 * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2089 * whose UID is later on reused. To lock this down we use the same trick used by container
2090 * managers to prohibit host users to get access to files of the same UID in containers: we
2091 * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2092 * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2093 * to make this directory permeable for the service itself.
2094 *
2095 * Specifically: for a service which wants a special directory "foo/" we first create a
2096 * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2097 * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2098 * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2099 * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
2100 * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2101 * disabling the access boundary for the service and making sure it only gets access to the
2102 * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2103 *
2104 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2105 * owned by the service itself.
2106 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2107 * files or sockets with other services. */
2108
2109 private_root = strjoin(params->prefix[type], "/private");
2110 if (!private_root) {
2111 r = -ENOMEM;
2112 goto fail;
2113 }
2114
2115 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2116 r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2117 if (r < 0)
2118 goto fail;
2119
2120 pp = strjoin(private_root, "/", *rt);
2121 if (!pp) {
2122 r = -ENOMEM;
2123 goto fail;
2124 }
2125
2126 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2127 r = mkdir_parents_label(pp, 0755);
2128 if (r < 0)
2129 goto fail;
2130
2131 if (is_dir(p, false) > 0 &&
2132 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2133
2134 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2135 * it over. Most likely the service has been upgraded from one that didn't use
2136 * DynamicUser=1, to one that does. */
2137
2138 if (rename(p, pp) < 0) {
2139 r = -errno;
2140 goto fail;
2141 }
2142 } else {
2143 /* Otherwise, create the actual directory for the service */
2144
2145 r = mkdir_label(pp, context->directories[type].mode);
2146 if (r < 0 && r != -EEXIST)
2147 goto fail;
2148 }
2149
2150 /* And link it up from the original place */
2151 r = symlink_idempotent(pp, p, true);
2152 if (r < 0)
2153 goto fail;
2154
2155 /* Lock down the access mode */
2156 if (chmod(pp, context->directories[type].mode) < 0) {
2157 r = -errno;
2158 goto fail;
2159 }
2160 } else {
2161 r = mkdir_label(p, context->directories[type].mode);
2162 if (r < 0 && r != -EEXIST)
2163 goto fail;
2164 if (r == -EEXIST && !context->dynamic_user)
2165 continue;
2166 }
2167
2168 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2169 * a service, and shall not be writable. */
2170 if (type == EXEC_DIRECTORY_CONFIGURATION)
2171 continue;
2172
2173 /* Then, change the ownership of the whole tree, if necessary */
2174 r = path_chown_recursive(pp ?: p, uid, gid);
2175 if (r < 0)
2176 goto fail;
2177 }
2178
2179 return 0;
2180
2181 fail:
2182 *exit_status = exit_status_table[type];
2183 return r;
2184 }
2185
2186 #if ENABLE_SMACK
2187 static int setup_smack(
2188 const ExecContext *context,
2189 const ExecCommand *command) {
2190
2191 int r;
2192
2193 assert(context);
2194 assert(command);
2195
2196 if (context->smack_process_label) {
2197 r = mac_smack_apply_pid(0, context->smack_process_label);
2198 if (r < 0)
2199 return r;
2200 }
2201 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2202 else {
2203 _cleanup_free_ char *exec_label = NULL;
2204
2205 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2206 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2207 return r;
2208
2209 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2210 if (r < 0)
2211 return r;
2212 }
2213 #endif
2214
2215 return 0;
2216 }
2217 #endif
2218
2219 static int compile_bind_mounts(
2220 const ExecContext *context,
2221 const ExecParameters *params,
2222 BindMount **ret_bind_mounts,
2223 size_t *ret_n_bind_mounts,
2224 char ***ret_empty_directories) {
2225
2226 _cleanup_strv_free_ char **empty_directories = NULL;
2227 BindMount *bind_mounts;
2228 size_t n, h = 0, i;
2229 ExecDirectoryType t;
2230 int r;
2231
2232 assert(context);
2233 assert(params);
2234 assert(ret_bind_mounts);
2235 assert(ret_n_bind_mounts);
2236 assert(ret_empty_directories);
2237
2238 n = context->n_bind_mounts;
2239 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2240 if (!params->prefix[t])
2241 continue;
2242
2243 n += strv_length(context->directories[t].paths);
2244 }
2245
2246 if (n <= 0) {
2247 *ret_bind_mounts = NULL;
2248 *ret_n_bind_mounts = 0;
2249 *ret_empty_directories = NULL;
2250 return 0;
2251 }
2252
2253 bind_mounts = new(BindMount, n);
2254 if (!bind_mounts)
2255 return -ENOMEM;
2256
2257 for (i = 0; i < context->n_bind_mounts; i++) {
2258 BindMount *item = context->bind_mounts + i;
2259 char *s, *d;
2260
2261 s = strdup(item->source);
2262 if (!s) {
2263 r = -ENOMEM;
2264 goto finish;
2265 }
2266
2267 d = strdup(item->destination);
2268 if (!d) {
2269 free(s);
2270 r = -ENOMEM;
2271 goto finish;
2272 }
2273
2274 bind_mounts[h++] = (BindMount) {
2275 .source = s,
2276 .destination = d,
2277 .read_only = item->read_only,
2278 .recursive = item->recursive,
2279 .ignore_enoent = item->ignore_enoent,
2280 };
2281 }
2282
2283 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2284 char **suffix;
2285
2286 if (!params->prefix[t])
2287 continue;
2288
2289 if (strv_isempty(context->directories[t].paths))
2290 continue;
2291
2292 if (context->dynamic_user &&
2293 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2294 !(context->root_directory || context->root_image)) {
2295 char *private_root;
2296
2297 /* So this is for a dynamic user, and we need to make sure the process can access its own
2298 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2299 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2300
2301 private_root = strjoin(params->prefix[t], "/private");
2302 if (!private_root) {
2303 r = -ENOMEM;
2304 goto finish;
2305 }
2306
2307 r = strv_consume(&empty_directories, private_root);
2308 if (r < 0)
2309 goto finish;
2310 }
2311
2312 STRV_FOREACH(suffix, context->directories[t].paths) {
2313 char *s, *d;
2314
2315 if (context->dynamic_user &&
2316 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2317 s = strjoin(params->prefix[t], "/private/", *suffix);
2318 else
2319 s = strjoin(params->prefix[t], "/", *suffix);
2320 if (!s) {
2321 r = -ENOMEM;
2322 goto finish;
2323 }
2324
2325 if (context->dynamic_user &&
2326 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2327 (context->root_directory || context->root_image))
2328 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2329 * directory is not created on the root directory. So, let's bind-mount the directory
2330 * on the 'non-private' place. */
2331 d = strjoin(params->prefix[t], "/", *suffix);
2332 else
2333 d = strdup(s);
2334 if (!d) {
2335 free(s);
2336 r = -ENOMEM;
2337 goto finish;
2338 }
2339
2340 bind_mounts[h++] = (BindMount) {
2341 .source = s,
2342 .destination = d,
2343 .read_only = false,
2344 .recursive = true,
2345 .ignore_enoent = false,
2346 };
2347 }
2348 }
2349
2350 assert(h == n);
2351
2352 *ret_bind_mounts = bind_mounts;
2353 *ret_n_bind_mounts = n;
2354 *ret_empty_directories = TAKE_PTR(empty_directories);
2355
2356 return (int) n;
2357
2358 finish:
2359 bind_mount_free_many(bind_mounts, h);
2360 return r;
2361 }
2362
2363 static int apply_mount_namespace(
2364 const Unit *u,
2365 const ExecCommand *command,
2366 const ExecContext *context,
2367 const ExecParameters *params,
2368 const ExecRuntime *runtime) {
2369
2370 _cleanup_strv_free_ char **empty_directories = NULL;
2371 char *tmp = NULL, *var = NULL;
2372 const char *root_dir = NULL, *root_image = NULL;
2373 NamespaceInfo ns_info;
2374 bool needs_sandboxing;
2375 BindMount *bind_mounts = NULL;
2376 size_t n_bind_mounts = 0;
2377 int r;
2378
2379 assert(context);
2380
2381 /* The runtime struct only contains the parent of the private /tmp,
2382 * which is non-accessible to world users. Inside of it there's a /tmp
2383 * that is sticky, and that's the one we want to use here. */
2384
2385 if (context->private_tmp && runtime) {
2386 if (runtime->tmp_dir)
2387 tmp = strjoina(runtime->tmp_dir, "/tmp");
2388 if (runtime->var_tmp_dir)
2389 var = strjoina(runtime->var_tmp_dir, "/tmp");
2390 }
2391
2392 if (params->flags & EXEC_APPLY_CHROOT) {
2393 root_image = context->root_image;
2394
2395 if (!root_image)
2396 root_dir = context->root_directory;
2397 }
2398
2399 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2400 if (r < 0)
2401 return r;
2402
2403 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2404 if (needs_sandboxing)
2405 ns_info = (NamespaceInfo) {
2406 .ignore_protect_paths = false,
2407 .private_dev = context->private_devices,
2408 .protect_control_groups = context->protect_control_groups,
2409 .protect_kernel_tunables = context->protect_kernel_tunables,
2410 .protect_kernel_modules = context->protect_kernel_modules,
2411 .mount_apivfs = context->mount_apivfs,
2412 .private_mounts = context->private_mounts,
2413 };
2414 else if (!context->dynamic_user && root_dir)
2415 /*
2416 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2417 * sandbox info, otherwise enforce it, don't ignore protected paths and
2418 * fail if we are enable to apply the sandbox inside the mount namespace.
2419 */
2420 ns_info = (NamespaceInfo) {
2421 .ignore_protect_paths = true,
2422 };
2423 else
2424 ns_info = (NamespaceInfo) {};
2425
2426 r = setup_namespace(root_dir, root_image,
2427 &ns_info, context->read_write_paths,
2428 needs_sandboxing ? context->read_only_paths : NULL,
2429 needs_sandboxing ? context->inaccessible_paths : NULL,
2430 empty_directories,
2431 bind_mounts,
2432 n_bind_mounts,
2433 context->temporary_filesystems,
2434 context->n_temporary_filesystems,
2435 tmp,
2436 var,
2437 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2438 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2439 context->mount_flags,
2440 DISSECT_IMAGE_DISCARD_ON_LOOP);
2441
2442 bind_mount_free_many(bind_mounts, n_bind_mounts);
2443
2444 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2445 * that with a special, recognizable error ENOANO. In this case, silently proceeed, but only if exclusively
2446 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2447 * completely different execution environment. */
2448 if (r == -ENOANO) {
2449 if (n_bind_mounts == 0 &&
2450 context->n_temporary_filesystems == 0 &&
2451 !root_dir && !root_image &&
2452 !context->dynamic_user) {
2453 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2454 return 0;
2455 }
2456
2457 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2458 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2459 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2460
2461 return -EOPNOTSUPP;
2462 }
2463
2464 return r;
2465 }
2466
2467 static int apply_working_directory(
2468 const ExecContext *context,
2469 const ExecParameters *params,
2470 const char *home,
2471 const bool needs_mount_ns,
2472 int *exit_status) {
2473
2474 const char *d, *wd;
2475
2476 assert(context);
2477 assert(exit_status);
2478
2479 if (context->working_directory_home) {
2480
2481 if (!home) {
2482 *exit_status = EXIT_CHDIR;
2483 return -ENXIO;
2484 }
2485
2486 wd = home;
2487
2488 } else if (context->working_directory)
2489 wd = context->working_directory;
2490 else
2491 wd = "/";
2492
2493 if (params->flags & EXEC_APPLY_CHROOT) {
2494 if (!needs_mount_ns && context->root_directory)
2495 if (chroot(context->root_directory) < 0) {
2496 *exit_status = EXIT_CHROOT;
2497 return -errno;
2498 }
2499
2500 d = wd;
2501 } else
2502 d = prefix_roota(context->root_directory, wd);
2503
2504 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2505 *exit_status = EXIT_CHDIR;
2506 return -errno;
2507 }
2508
2509 return 0;
2510 }
2511
2512 static int setup_keyring(
2513 const Unit *u,
2514 const ExecContext *context,
2515 const ExecParameters *p,
2516 uid_t uid, gid_t gid) {
2517
2518 key_serial_t keyring;
2519 int r = 0;
2520 uid_t saved_uid;
2521 gid_t saved_gid;
2522
2523 assert(u);
2524 assert(context);
2525 assert(p);
2526
2527 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2528 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2529 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2530 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2531 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2532 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2533
2534 if (!(p->flags & EXEC_NEW_KEYRING))
2535 return 0;
2536
2537 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2538 return 0;
2539
2540 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2541 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2542 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2543 * & group is just as nasty as acquiring a reference to the user keyring. */
2544
2545 saved_uid = getuid();
2546 saved_gid = getgid();
2547
2548 if (gid_is_valid(gid) && gid != saved_gid) {
2549 if (setregid(gid, -1) < 0)
2550 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2551 }
2552
2553 if (uid_is_valid(uid) && uid != saved_uid) {
2554 if (setreuid(uid, -1) < 0) {
2555 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2556 goto out;
2557 }
2558 }
2559
2560 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2561 if (keyring == -1) {
2562 if (errno == ENOSYS)
2563 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2564 else if (IN_SET(errno, EACCES, EPERM))
2565 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2566 else if (errno == EDQUOT)
2567 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2568 else
2569 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2570
2571 goto out;
2572 }
2573
2574 /* When requested link the user keyring into the session keyring. */
2575 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2576
2577 if (keyctl(KEYCTL_LINK,
2578 KEY_SPEC_USER_KEYRING,
2579 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2580 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2581 goto out;
2582 }
2583 }
2584
2585 /* Restore uid/gid back */
2586 if (uid_is_valid(uid) && uid != saved_uid) {
2587 if (setreuid(saved_uid, -1) < 0) {
2588 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2589 goto out;
2590 }
2591 }
2592
2593 if (gid_is_valid(gid) && gid != saved_gid) {
2594 if (setregid(saved_gid, -1) < 0)
2595 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2596 }
2597
2598 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2599 if (!sd_id128_is_null(u->invocation_id)) {
2600 key_serial_t key;
2601
2602 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2603 if (key == -1)
2604 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2605 else {
2606 if (keyctl(KEYCTL_SETPERM, key,
2607 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2608 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2609 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2610 }
2611 }
2612
2613 out:
2614 /* Revert back uid & gid for the the last time, and exit */
2615 /* no extra logging, as only the first already reported error matters */
2616 if (getuid() != saved_uid)
2617 (void) setreuid(saved_uid, -1);
2618
2619 if (getgid() != saved_gid)
2620 (void) setregid(saved_gid, -1);
2621
2622 return r;
2623 }
2624
/* Appends the valid (i.e. non-negative) file descriptors of a socket pair to array[], in order, and advances
 * *n by the number of entries added. A NULL pair is accepted and ignored. The caller has to make sure the
 * array has room for two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[2]) {
        size_t i;

        assert(array);
        assert(n);

        if (!pair)
                return;

        for (i = 0; i < 2; i++)
                if (pair[i] >= 0)
                        array[(*n)++] = pair[i];
}
2637
2638 static int close_remaining_fds(
2639 const ExecParameters *params,
2640 const ExecRuntime *runtime,
2641 const DynamicCreds *dcreds,
2642 int user_lookup_fd,
2643 int socket_fd,
2644 int exec_fd,
2645 int *fds, size_t n_fds) {
2646
2647 size_t n_dont_close = 0;
2648 int dont_close[n_fds + 12];
2649
2650 assert(params);
2651
2652 if (params->stdin_fd >= 0)
2653 dont_close[n_dont_close++] = params->stdin_fd;
2654 if (params->stdout_fd >= 0)
2655 dont_close[n_dont_close++] = params->stdout_fd;
2656 if (params->stderr_fd >= 0)
2657 dont_close[n_dont_close++] = params->stderr_fd;
2658
2659 if (socket_fd >= 0)
2660 dont_close[n_dont_close++] = socket_fd;
2661 if (exec_fd >= 0)
2662 dont_close[n_dont_close++] = exec_fd;
2663 if (n_fds > 0) {
2664 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2665 n_dont_close += n_fds;
2666 }
2667
2668 if (runtime)
2669 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2670
2671 if (dcreds) {
2672 if (dcreds->user)
2673 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2674 if (dcreds->group)
2675 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2676 }
2677
2678 if (user_lookup_fd >= 0)
2679 dont_close[n_dont_close++] = user_lookup_fd;
2680
2681 return close_all_fds(dont_close, n_dont_close);
2682 }
2683
2684 static int send_user_lookup(
2685 Unit *unit,
2686 int user_lookup_fd,
2687 uid_t uid,
2688 gid_t gid) {
2689
2690 assert(unit);
2691
2692 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2693 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2694 * specified. */
2695
2696 if (user_lookup_fd < 0)
2697 return 0;
2698
2699 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2700 return 0;
2701
2702 if (writev(user_lookup_fd,
2703 (struct iovec[]) {
2704 IOVEC_INIT(&uid, sizeof(uid)),
2705 IOVEC_INIT(&gid, sizeof(gid)),
2706 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2707 return -errno;
2708
2709 return 0;
2710 }
2711
2712 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2713 int r;
2714
2715 assert(c);
2716 assert(home);
2717 assert(buf);
2718
2719 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2720
2721 if (*home)
2722 return 0;
2723
2724 if (!c->working_directory_home)
2725 return 0;
2726
2727 if (uid == 0) {
2728 /* Hardcode /root as home directory for UID 0 */
2729 *home = "/root";
2730 return 1;
2731 }
2732
2733 r = get_home_dir(buf);
2734 if (r < 0)
2735 return r;
2736
2737 *home = *buf;
2738 return 1;
2739 }
2740
2741 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2742 _cleanup_strv_free_ char ** list = NULL;
2743 ExecDirectoryType t;
2744 int r;
2745
2746 assert(c);
2747 assert(p);
2748 assert(ret);
2749
2750 assert(c->dynamic_user);
2751
2752 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2753 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2754 * directories. */
2755
2756 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2757 char **i;
2758
2759 if (t == EXEC_DIRECTORY_CONFIGURATION)
2760 continue;
2761
2762 if (!p->prefix[t])
2763 continue;
2764
2765 STRV_FOREACH(i, c->directories[t].paths) {
2766 char *e;
2767
2768 if (t == EXEC_DIRECTORY_RUNTIME)
2769 e = strjoin(p->prefix[t], "/", *i);
2770 else
2771 e = strjoin(p->prefix[t], "/private/", *i);
2772 if (!e)
2773 return -ENOMEM;
2774
2775 r = strv_consume(&list, e);
2776 if (r < 0)
2777 return r;
2778 }
2779 }
2780
2781 *ret = TAKE_PTR(list);
2782
2783 return 0;
2784 }
2785
2786 static char *exec_command_line(char **argv);
2787
2788 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
2789 bool using_subcgroup;
2790 char *p;
2791
2792 assert(params);
2793 assert(ret);
2794
2795 if (!params->cgroup_path)
2796 return -EINVAL;
2797
2798 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
2799 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
2800 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
2801 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
2802 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
2803 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
2804 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
2805 * flag, which is only passed for the former statements, not for the latter. */
2806
2807 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
2808 if (using_subcgroup)
2809 p = strjoin(params->cgroup_path, "/.control");
2810 else
2811 p = strdup(params->cgroup_path);
2812 if (!p)
2813 return -ENOMEM;
2814
2815 *ret = p;
2816 return using_subcgroup;
2817 }
2818
2819 static int exec_child(
2820 Unit *unit,
2821 const ExecCommand *command,
2822 const ExecContext *context,
2823 const ExecParameters *params,
2824 ExecRuntime *runtime,
2825 DynamicCreds *dcreds,
2826 int socket_fd,
2827 int named_iofds[3],
2828 int *fds,
2829 size_t n_socket_fds,
2830 size_t n_storage_fds,
2831 char **files_env,
2832 int user_lookup_fd,
2833 int *exit_status) {
2834
2835 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2836 int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
2837 _cleanup_free_ gid_t *supplementary_gids = NULL;
2838 const char *username = NULL, *groupname = NULL;
2839 _cleanup_free_ char *home_buffer = NULL;
2840 const char *home = NULL, *shell = NULL;
2841 dev_t journal_stream_dev = 0;
2842 ino_t journal_stream_ino = 0;
2843 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2844 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
2845 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
2846 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
2847 #if HAVE_SELINUX
2848 _cleanup_free_ char *mac_selinux_context_net = NULL;
2849 bool use_selinux = false;
2850 #endif
2851 #if ENABLE_SMACK
2852 bool use_smack = false;
2853 #endif
2854 #if HAVE_APPARMOR
2855 bool use_apparmor = false;
2856 #endif
2857 uid_t uid = UID_INVALID;
2858 gid_t gid = GID_INVALID;
2859 size_t n_fds;
2860 ExecDirectoryType dt;
2861 int secure_bits;
2862
2863 assert(unit);
2864 assert(command);
2865 assert(context);
2866 assert(params);
2867 assert(exit_status);
2868
2869 rename_process_from_path(command->path);
2870
2871 /* We reset exactly these signals, since they are the
2872 * only ones we set to SIG_IGN in the main daemon. All
2873 * others we leave untouched because we set them to
2874 * SIG_DFL or a valid handler initially, both of which
2875 * will be demoted to SIG_DFL. */
2876 (void) default_signals(SIGNALS_CRASH_HANDLER,
2877 SIGNALS_IGNORE, -1);
2878
2879 if (context->ignore_sigpipe)
2880 (void) ignore_signals(SIGPIPE, -1);
2881
2882 r = reset_signal_mask();
2883 if (r < 0) {
2884 *exit_status = EXIT_SIGNAL_MASK;
2885 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2886 }
2887
2888 if (params->idle_pipe)
2889 do_idle_pipe_dance(params->idle_pipe);
2890
2891 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2892 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2893 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2894 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2895
2896 log_forget_fds();
2897 log_set_open_when_needed(true);
2898
2899 /* In case anything used libc syslog(), close this here, too */
2900 closelog();
2901
2902 n_fds = n_socket_fds + n_storage_fds;
2903 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
2904 if (r < 0) {
2905 *exit_status = EXIT_FDS;
2906 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2907 }
2908
2909 if (!context->same_pgrp)
2910 if (setsid() < 0) {
2911 *exit_status = EXIT_SETSID;
2912 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2913 }
2914
2915 exec_context_tty_reset(context, params);
2916
2917 if (unit_shall_confirm_spawn(unit)) {
2918 const char *vc = params->confirm_spawn;
2919 _cleanup_free_ char *cmdline = NULL;
2920
2921 cmdline = exec_command_line(command->argv);
2922 if (!cmdline) {
2923 *exit_status = EXIT_MEMORY;
2924 return log_oom();
2925 }
2926
2927 r = ask_for_confirmation(vc, unit, cmdline);
2928 if (r != CONFIRM_EXECUTE) {
2929 if (r == CONFIRM_PRETEND_SUCCESS) {
2930 *exit_status = EXIT_SUCCESS;
2931 return 0;
2932 }
2933 *exit_status = EXIT_CONFIRM;
2934 log_unit_error(unit, "Execution cancelled by the user");
2935 return -ECANCELED;
2936 }
2937 }
2938
2939 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
2940 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
2941 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
2942 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
2943 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
2944 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
2945 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
2946 *exit_status = EXIT_MEMORY;
2947 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2948 }
2949
2950 if (context->dynamic_user && dcreds) {
2951 _cleanup_strv_free_ char **suggested_paths = NULL;
2952
2953 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
2954 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
2955 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2956 *exit_status = EXIT_USER;
2957 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2958 }
2959
2960 r = compile_suggested_paths(context, params, &suggested_paths);
2961 if (r < 0) {
2962 *exit_status = EXIT_MEMORY;
2963 return log_oom();
2964 }
2965
2966 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2967 if (r < 0) {
2968 *exit_status = EXIT_USER;
2969 if (r == -EILSEQ) {
2970 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2971 return -EOPNOTSUPP;
2972 }
2973 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2974 }
2975
2976 if (!uid_is_valid(uid)) {
2977 *exit_status = EXIT_USER;
2978 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2979 return -ESRCH;
2980 }
2981
2982 if (!gid_is_valid(gid)) {
2983 *exit_status = EXIT_USER;
2984 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2985 return -ESRCH;
2986 }
2987
2988 if (dcreds->user)
2989 username = dcreds->user->name;
2990
2991 } else {
2992 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2993 if (r < 0) {
2994 *exit_status = EXIT_USER;
2995 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2996 }
2997
2998 r = get_fixed_group(context, &groupname, &gid);
2999 if (r < 0) {
3000 *exit_status = EXIT_GROUP;
3001 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3002 }
3003 }
3004
3005 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3006 r = get_supplementary_groups(context, username, groupname, gid,
3007 &supplementary_gids, &ngids);
3008 if (r < 0) {
3009 *exit_status = EXIT_GROUP;
3010 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3011 }
3012
3013 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3014 if (r < 0) {
3015 *exit_status = EXIT_USER;
3016 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3017 }
3018
3019 user_lookup_fd = safe_close(user_lookup_fd);
3020
3021 r = acquire_home(context, uid, &home, &home_buffer);
3022 if (r < 0) {
3023 *exit_status = EXIT_CHDIR;
3024 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3025 }
3026
3027 /* If a socket is connected to STDIN/STDOUT/STDERR, we
3028 * must be sure to drop O_NONBLOCK */
3029 if (socket_fd >= 0)
3030 (void) fd_nonblock(socket_fd, false);
3031
3032 r = setup_input(context, params, socket_fd, named_iofds);
3033 if (r < 0) {
3034 *exit_status = EXIT_STDIN;
3035 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3036 }
3037
3038 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3039 if (r < 0) {
3040 *exit_status = EXIT_STDOUT;
3041 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3042 }
3043
3044 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3045 if (r < 0) {
3046 *exit_status = EXIT_STDERR;
3047 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3048 }
3049
3050 if (params->cgroup_path) {
3051 _cleanup_free_ char *p = NULL;
3052
3053 r = exec_parameters_get_cgroup_path(params, &p);
3054 if (r < 0) {
3055 *exit_status = EXIT_CGROUP;
3056 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3057 }
3058
3059 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3060 if (r < 0) {
3061 *exit_status = EXIT_CGROUP;
3062 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3063 }
3064 }
3065
3066 if (context->oom_score_adjust_set) {
3067 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3068 * prohibit write access to this file, and we shouldn't trip up over that. */
3069 r = set_oom_score_adjust(context->oom_score_adjust);
3070 if (IN_SET(r, -EPERM, -EACCES))
3071 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3072 else if (r < 0) {
3073 *exit_status = EXIT_OOM_ADJUST;
3074 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3075 }
3076 }
3077
3078 if (context->nice_set)
3079 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
3080 *exit_status = EXIT_NICE;
3081 return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
3082 }
3083
3084 if (context->cpu_sched_set) {
3085 struct sched_param param = {
3086 .sched_priority = context->cpu_sched_priority,
3087 };
3088
3089 r = sched_setscheduler(0,
3090 context->cpu_sched_policy |
3091 (context->cpu_sched_reset_on_fork ?
3092 SCHED_RESET_ON_FORK : 0),
3093 &param);
3094 if (r < 0) {
3095 *exit_status = EXIT_SETSCHEDULER;
3096 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3097 }
3098 }
3099
3100 if (context->cpuset)
3101 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
3102 *exit_status = EXIT_CPUAFFINITY;
3103 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3104 }
3105
3106 if (context->ioprio_set)
3107 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3108 *exit_status = EXIT_IOPRIO;
3109 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3110 }
3111
3112 if (context->timer_slack_nsec != NSEC_INFINITY)
3113 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3114 *exit_status = EXIT_TIMERSLACK;
3115 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3116 }
3117
3118 if (context->personality != PERSONALITY_INVALID) {
3119 r = safe_personality(context->personality);
3120 if (r < 0) {
3121 *exit_status = EXIT_PERSONALITY;
3122 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3123 }
3124 }
3125
3126 if (context->utmp_id)
3127 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3128 context->tty_path,
3129 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
3130 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3131 USER_PROCESS,
3132 username);
3133
3134 if (context->user) {
3135 r = chown_terminal(STDIN_FILENO, uid);
3136 if (r < 0) {
3137 *exit_status = EXIT_STDIN;
3138 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3139 }
3140 }
3141
3142 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
3143 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3144 * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3145 * touch a single hierarchy too. */
3146 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3147 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3148 if (r < 0) {
3149 *exit_status = EXIT_CGROUP;
3150 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3151 }
3152 }
3153
3154 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3155 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3156 if (r < 0)
3157 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3158 }
3159
3160 r = build_environment(
3161 unit,
3162 context,
3163 params,
3164 n_fds,
3165 home,
3166 username,
3167 shell,
3168 journal_stream_dev,
3169 journal_stream_ino,
3170 &our_env);
3171 if (r < 0) {
3172 *exit_status = EXIT_MEMORY;
3173 return log_oom();
3174 }
3175
3176 r = build_pass_environment(context, &pass_env);
3177 if (r < 0) {
3178 *exit_status = EXIT_MEMORY;
3179 return log_oom();
3180 }
3181
3182 accum_env = strv_env_merge(5,
3183 params->environment,
3184 our_env,
3185 pass_env,
3186 context->environment,
3187 files_env,
3188 NULL);
3189 if (!accum_env) {
3190 *exit_status = EXIT_MEMORY;
3191 return log_oom();
3192 }
3193 accum_env = strv_env_clean(accum_env);
3194
3195 (void) umask(context->umask);
3196
3197 r = setup_keyring(unit, context, params, uid, gid);
3198 if (r < 0) {
3199 *exit_status = EXIT_KEYRING;
3200 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3201 }
3202
3203 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3204 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3205
3206 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3207 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3208
3209 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3210 if (needs_ambient_hack)
3211 needs_setuid = false;
3212 else
3213 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3214
3215 if (needs_sandboxing) {
3216 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3217 * present. The actual MAC context application will happen later, as late as possible, to avoid
3218 * impacting our own code paths. */
3219
3220 #if HAVE_SELINUX
3221 use_selinux = mac_selinux_use();
3222 #endif
3223 #if ENABLE_SMACK
3224 use_smack = mac_smack_use();
3225 #endif
3226 #if HAVE_APPARMOR
3227 use_apparmor = mac_apparmor_use();
3228 #endif
3229 }
3230
3231 if (needs_setuid) {
3232 if (context->pam_name && username) {
3233 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3234 if (r < 0) {
3235 *exit_status = EXIT_PAM;
3236 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3237 }
3238 }
3239 }
3240
3241 if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3242 if (ns_type_supported(NAMESPACE_NET)) {
3243 r = setup_netns(runtime->netns_storage_socket);
3244 if (r < 0) {
3245 *exit_status = EXIT_NETWORK;
3246 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3247 }
3248 } else
3249 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3250 }
3251
3252 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3253 if (needs_mount_namespace) {
3254 r = apply_mount_namespace(unit, command, context, params, runtime);
3255 if (r < 0) {
3256 *exit_status = EXIT_NAMESPACE;
3257 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3258 }
3259 }
3260
3261 /* Drop groups as early as possible */
3262 if (needs_setuid) {
3263 r = enforce_groups(gid, supplementary_gids, ngids);
3264 if (r < 0) {
3265 *exit_status = EXIT_GROUP;
3266 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3267 }
3268 }
3269
3270 if (needs_sandboxing) {
3271 #if HAVE_SELINUX
3272 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3273 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3274 if (r < 0) {
3275 *exit_status = EXIT_SELINUX_CONTEXT;
3276 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3277 }
3278 }
3279 #endif
3280
3281 if (context->private_users) {
3282 r = setup_private_users(uid, gid);
3283 if (r < 0) {
3284 *exit_status = EXIT_USER;
3285 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3286 }
3287 }
3288 }
3289
3290 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3291 * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3292 * however if we have it as we want to keep it open until the final execve(). */
3293
3294 if (params->exec_fd >= 0) {
3295 exec_fd = params->exec_fd;
3296
3297 if (exec_fd < 3 + (int) n_fds) {
3298 int moved_fd;
3299
3300 /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3301 * process we are about to execute. */
3302
3303 moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3304 if (moved_fd < 0) {
3305 *exit_status = EXIT_FDS;
3306 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3307 }
3308
3309 safe_close(exec_fd);
3310 exec_fd = moved_fd;
3311 } else {
3312 /* This fd should be FD_CLOEXEC already, but let's make sure. */
3313 r = fd_cloexec(exec_fd, true);
3314 if (r < 0) {
3315 *exit_status = EXIT_FDS;
3316 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3317 }
3318 }
3319
3320 fds_with_exec_fd = newa(int, n_fds + 1);
3321 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3322 fds_with_exec_fd[n_fds] = exec_fd;
3323 n_fds_with_exec_fd = n_fds + 1;
3324 } else {
3325 fds_with_exec_fd = fds;
3326 n_fds_with_exec_fd = n_fds;
3327 }
3328
3329 r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3330 if (r >= 0)
3331 r = shift_fds(fds, n_fds);
3332 if (r >= 0)
3333 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3334 if (r < 0) {
3335 *exit_status = EXIT_FDS;
3336 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3337 }
3338
3339 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3340 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3341 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3342 * came this far. */
3343
3344 secure_bits = context->secure_bits;
3345
3346 if (needs_sandboxing) {
3347 uint64_t bset;
3348 int which_failed;
3349
3350 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3351 if (r < 0) {
3352 *exit_status = EXIT_LIMITS;
3353 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3354 }
3355
3356 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3357 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3358 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3359 *exit_status = EXIT_LIMITS;
3360 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3361 }
3362 }
3363
3364 #if ENABLE_SMACK
3365 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3366 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3367 if (use_smack) {
3368 r = setup_smack(context, command);
3369 if (r < 0) {
3370 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3371 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3372 }
3373 }
3374 #endif
3375
3376 bset = context->capability_bounding_set;
3377 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3378 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3379 * instead of us doing that */
3380 if (needs_ambient_hack)
3381 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3382 (UINT64_C(1) << CAP_SETUID) |
3383 (UINT64_C(1) << CAP_SETGID);
3384
3385 if (!cap_test_all(bset)) {
3386 r = capability_bounding_set_drop(bset, false);
3387 if (r < 0) {
3388 *exit_status = EXIT_CAPABILITIES;
3389 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3390 }
3391 }
3392
3393 /* This is done before enforce_user, but ambient set
3394 * does not survive over setresuid() if keep_caps is not set. */
3395 if (!needs_ambient_hack &&
3396 context->capability_ambient_set != 0) {
3397 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3398 if (r < 0) {
3399 *exit_status = EXIT_CAPABILITIES;
3400 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3401 }
3402 }
3403 }
3404
3405 if (needs_setuid) {
3406 if (context->user) {
3407 r = enforce_user(context, uid);
3408 if (r < 0) {
3409 *exit_status = EXIT_USER;
3410 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3411 }
3412
3413 if (!needs_ambient_hack &&
3414 context->capability_ambient_set != 0) {
3415
3416 /* Fix the ambient capabilities after user change. */
3417 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3418 if (r < 0) {
3419 *exit_status = EXIT_CAPABILITIES;
3420 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3421 }
3422
3423 /* If we were asked to change user and ambient capabilities
3424 * were requested, we had to add keep-caps to the securebits
3425 * so that we would maintain the inherited capability set
3426 * through the setresuid(). Make sure that the bit is added
3427 * also to the context secure_bits so that we don't try to
3428 * drop the bit away next. */
3429
3430 secure_bits |= 1<<SECURE_KEEP_CAPS;
3431 }
3432 }
3433 }
3434
3435 /* Apply working directory here, because the working directory might be on NFS and only the user running
3436 * this service might have the correct privilege to change to the working directory */
3437 r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3438 if (r < 0)
3439 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3440
3441 if (needs_sandboxing) {
3442 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3443 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3444 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3445 * are restricted. */
3446
3447 #if HAVE_SELINUX
3448 if (use_selinux) {
3449 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3450
3451 if (exec_context) {
3452 r = setexeccon(exec_context);
3453 if (r < 0) {
3454 *exit_status = EXIT_SELINUX_CONTEXT;
3455 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3456 }
3457 }
3458 }
3459 #endif
3460
3461 #if HAVE_APPARMOR
3462 if (use_apparmor && context->apparmor_profile) {
3463 r = aa_change_onexec(context->apparmor_profile);
3464 if (r < 0 && !context->apparmor_profile_ignore) {
3465 *exit_status = EXIT_APPARMOR_PROFILE;
3466 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3467 }
3468 }
3469 #endif
3470
3471 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3472 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3473 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3474 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3475 *exit_status = EXIT_SECUREBITS;
3476 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3477 }
3478
3479 if (context_has_no_new_privileges(context))
3480 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3481 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3482 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3483 }
3484
3485 #if HAVE_SECCOMP
3486 r = apply_address_families(unit, context);
3487 if (r < 0) {
3488 *exit_status = EXIT_ADDRESS_FAMILIES;
3489 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3490 }
3491
3492 r = apply_memory_deny_write_execute(unit, context);
3493 if (r < 0) {
3494 *exit_status = EXIT_SECCOMP;
3495 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3496 }
3497
3498 r = apply_restrict_realtime(unit, context);
3499 if (r < 0) {
3500 *exit_status = EXIT_SECCOMP;
3501 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3502 }
3503
3504 r = apply_restrict_namespaces(unit, context);
3505 if (r < 0) {
3506 *exit_status = EXIT_SECCOMP;
3507 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3508 }
3509
3510 r = apply_protect_sysctl(unit, context);
3511 if (r < 0) {
3512 *exit_status = EXIT_SECCOMP;
3513 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3514 }
3515
3516 r = apply_protect_kernel_modules(unit, context);
3517 if (r < 0) {
3518 *exit_status = EXIT_SECCOMP;
3519 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3520 }
3521
3522 r = apply_private_devices(unit, context);
3523 if (r < 0) {
3524 *exit_status = EXIT_SECCOMP;
3525 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3526 }
3527
3528 r = apply_syscall_archs(unit, context);
3529 if (r < 0) {
3530 *exit_status = EXIT_SECCOMP;
3531 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3532 }
3533
3534 r = apply_lock_personality(unit, context);
3535 if (r < 0) {
3536 *exit_status = EXIT_SECCOMP;
3537 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3538 }
3539
3540 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3541 * by the filter as little as possible. */
3542 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3543 if (r < 0) {
3544 *exit_status = EXIT_SECCOMP;
3545 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3546 }
3547 #endif
3548 }
3549
3550 if (!strv_isempty(context->unset_environment)) {
3551 char **ee = NULL;
3552
3553 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3554 if (!ee) {
3555 *exit_status = EXIT_MEMORY;
3556 return log_oom();
3557 }
3558
3559 strv_free_and_replace(accum_env, ee);
3560 }
3561
3562 final_argv = replace_env_argv(command->argv, accum_env);
3563 if (!final_argv) {
3564 *exit_status = EXIT_MEMORY;
3565 return log_oom();
3566 }
3567
3568 if (DEBUG_LOGGING) {
3569 _cleanup_free_ char *line;
3570
3571 line = exec_command_line(final_argv);
3572 if (line)
3573 log_struct(LOG_DEBUG,
3574 "EXECUTABLE=%s", command->path,
3575 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3576 LOG_UNIT_ID(unit),
3577 LOG_UNIT_INVOCATION_ID(unit));
3578 }
3579
3580 if (exec_fd >= 0) {
3581 uint8_t hot = 1;
3582
3583 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3584 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3585
3586 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3587 *exit_status = EXIT_EXEC;
3588 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3589 }
3590 }
3591
3592 execve(command->path, final_argv, accum_env);
3593 r = -errno;
3594
3595 if (exec_fd >= 0) {
3596 uint8_t hot = 0;
3597
3598 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3599 * that POLLHUP on it no longer means execve() succeeded. */
3600
3601 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3602 *exit_status = EXIT_EXEC;
3603 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
3604 }
3605 }
3606
3607 if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3608 log_struct_errno(LOG_INFO, r,
3609 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3610 LOG_UNIT_ID(unit),
3611 LOG_UNIT_INVOCATION_ID(unit),
3612 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3613 command->path),
3614 "EXECUTABLE=%s", command->path);
3615 return 0;
3616 }
3617
3618 *exit_status = EXIT_EXEC;
3619 return log_unit_error_errno(unit, r, "Failed to execute command: %m");
3620 }
3621
3622 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
3623 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]);
3624
3625 int exec_spawn(Unit *unit,
3626 ExecCommand *command,
3627 const ExecContext *context,
3628 const ExecParameters *params,
3629 ExecRuntime *runtime,
3630 DynamicCreds *dcreds,
3631 pid_t *ret) {
3632
3633 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
3634 _cleanup_free_ char *subcgroup_path = NULL;
3635 _cleanup_strv_free_ char **files_env = NULL;
3636 size_t n_storage_fds = 0, n_socket_fds = 0;
3637 _cleanup_free_ char *line = NULL;
3638 pid_t pid;
3639
3640 assert(unit);
3641 assert(command);
3642 assert(context);
3643 assert(ret);
3644 assert(params);
3645 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
3646
3647 if (context->std_input == EXEC_INPUT_SOCKET ||
3648 context->std_output == EXEC_OUTPUT_SOCKET ||
3649 context->std_error == EXEC_OUTPUT_SOCKET) {
3650
3651 if (params->n_socket_fds > 1) {
3652 log_unit_error(unit, "Got more than one socket.");
3653 return -EINVAL;
3654 }
3655
3656 if (params->n_socket_fds == 0) {
3657 log_unit_error(unit, "Got no socket.");
3658 return -EINVAL;
3659 }
3660
3661 socket_fd = params->fds[0];
3662 } else {
3663 socket_fd = -1;
3664 fds = params->fds;
3665 n_socket_fds = params->n_socket_fds;
3666 n_storage_fds = params->n_storage_fds;
3667 }
3668
3669 r = exec_context_named_iofds(context, params, named_iofds);
3670 if (r < 0)
3671 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3672
3673 r = exec_context_load_environment(unit, context, &files_env);
3674 if (r < 0)
3675 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3676
3677 line = exec_command_line(command->argv);
3678 if (!line)
3679 return log_oom();
3680
3681 log_struct(LOG_DEBUG,
3682 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3683 "EXECUTABLE=%s", command->path,
3684 LOG_UNIT_ID(unit),
3685 LOG_UNIT_INVOCATION_ID(unit));
3686
3687 if (params->cgroup_path) {
3688 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
3689 if (r < 0)
3690 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
3691 if (r > 0) { /* We are using a child cgroup */
3692 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
3693 if (r < 0)
3694 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
3695 }
3696 }
3697
3698 pid = fork();
3699 if (pid < 0)
3700 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3701
3702 if (pid == 0) {
3703 int exit_status = EXIT_SUCCESS;
3704
3705 r = exec_child(unit,
3706 command,
3707 context,
3708 params,
3709 runtime,
3710 dcreds,
3711 socket_fd,
3712 named_iofds,
3713 fds,
3714 n_socket_fds,
3715 n_storage_fds,
3716 files_env,
3717 unit->manager->user_lookup_fds[1],
3718 &exit_status);
3719
3720 if (r < 0)
3721 log_struct_errno(LOG_ERR, r,
3722 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3723 LOG_UNIT_ID(unit),
3724 LOG_UNIT_INVOCATION_ID(unit),
3725 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3726 exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3727 command->path),
3728 "EXECUTABLE=%s", command->path);
3729
3730 _exit(exit_status);
3731 }
3732
3733 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3734
3735 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
3736 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
3737 * process will be killed too). */
3738 if (subcgroup_path)
3739 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
3740
3741 exec_status_start(&command->exec_status, pid);
3742
3743 *ret = pid;
3744 return 0;
3745 }
3746
3747 void exec_context_init(ExecContext *c) {
3748 ExecDirectoryType i;
3749
3750 assert(c);
3751
3752 c->umask = 0022;
3753 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3754 c->cpu_sched_policy = SCHED_OTHER;
3755 c->syslog_priority = LOG_DAEMON|LOG_INFO;
3756 c->syslog_level_prefix = true;
3757 c->ignore_sigpipe = true;
3758 c->timer_slack_nsec = NSEC_INFINITY;
3759 c->personality = PERSONALITY_INVALID;
3760 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3761 c->directories[i].mode = 0755;
3762 c->capability_bounding_set = CAP_ALL;
3763 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
3764 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
3765 c->log_level_max = -1;
3766 }
3767
3768 void exec_context_done(ExecContext *c) {
3769 ExecDirectoryType i;
3770 size_t l;
3771
3772 assert(c);
3773
3774 c->environment = strv_free(c->environment);
3775 c->environment_files = strv_free(c->environment_files);
3776 c->pass_environment = strv_free(c->pass_environment);
3777 c->unset_environment = strv_free(c->unset_environment);
3778
3779 rlimit_free_all(c->rlimit);
3780
3781 for (l = 0; l < 3; l++) {
3782 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3783 c->stdio_file[l] = mfree(c->stdio_file[l]);
3784 }
3785
3786 c->working_directory = mfree(c->working_directory);
3787 c->root_directory = mfree(c->root_directory);
3788 c->root_image = mfree(c->root_image);
3789 c->tty_path = mfree(c->tty_path);
3790 c->syslog_identifier = mfree(c->syslog_identifier);
3791 c->user = mfree(c->user);
3792 c->group = mfree(c->group);
3793
3794 c->supplementary_groups = strv_free(c->supplementary_groups);
3795
3796 c->pam_name = mfree(c->pam_name);
3797
3798 c->read_only_paths = strv_free(c->read_only_paths);
3799 c->read_write_paths = strv_free(c->read_write_paths);
3800 c->inaccessible_paths = strv_free(c->inaccessible_paths);
3801
3802 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3803 c->bind_mounts = NULL;
3804 c->n_bind_mounts = 0;
3805 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3806 c->temporary_filesystems = NULL;
3807 c->n_temporary_filesystems = 0;
3808
3809 c->cpuset = cpu_set_mfree(c->cpuset);
3810
3811 c->utmp_id = mfree(c->utmp_id);
3812 c->selinux_context = mfree(c->selinux_context);
3813 c->apparmor_profile = mfree(c->apparmor_profile);
3814 c->smack_process_label = mfree(c->smack_process_label);
3815
3816 c->syscall_filter = hashmap_free(c->syscall_filter);
3817 c->syscall_archs = set_free(c->syscall_archs);
3818 c->address_families = set_free(c->address_families);
3819
3820 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3821 c->directories[i].paths = strv_free(c->directories[i].paths);
3822
3823 c->log_level_max = -1;
3824
3825 exec_context_free_log_extra_fields(c);
3826
3827 c->log_rate_limit_interval_usec = 0;
3828 c->log_rate_limit_burst = 0;
3829
3830 c->stdin_data = mfree(c->stdin_data);
3831 c->stdin_data_size = 0;
3832 }
3833
3834 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
3835 char **i;
3836
3837 assert(c);
3838
3839 if (!runtime_prefix)
3840 return 0;
3841
3842 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3843 _cleanup_free_ char *p;
3844
3845 p = strjoin(runtime_prefix, "/", *i);
3846 if (!p)
3847 return -ENOMEM;
3848
3849 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3850 * next. */
3851 (void) rm_rf(p, REMOVE_ROOT);
3852 }
3853
3854 return 0;
3855 }
3856
3857 static void exec_command_done(ExecCommand *c) {
3858 assert(c);
3859
3860 c->path = mfree(c->path);
3861 c->argv = strv_free(c->argv);
3862 }
3863
3864 void exec_command_done_array(ExecCommand *c, size_t n) {
3865 size_t i;
3866
3867 for (i = 0; i < n; i++)
3868 exec_command_done(c+i);
3869 }
3870
3871 ExecCommand* exec_command_free_list(ExecCommand *c) {
3872 ExecCommand *i;
3873
3874 while ((i = c)) {
3875 LIST_REMOVE(command, c, i);
3876 exec_command_done(i);
3877 free(i);
3878 }
3879
3880 return NULL;
3881 }
3882
3883 void exec_command_free_array(ExecCommand **c, size_t n) {
3884 size_t i;
3885
3886 for (i = 0; i < n; i++)
3887 c[i] = exec_command_free_list(c[i]);
3888 }
3889
3890 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
3891 size_t i;
3892
3893 for (i = 0; i < n; i++)
3894 exec_status_reset(&c[i].exec_status);
3895 }
3896
3897 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
3898 size_t i;
3899
3900 for (i = 0; i < n; i++) {
3901 ExecCommand *z;
3902
3903 LIST_FOREACH(command, z, c[i])
3904 exec_status_reset(&z->exec_status);
3905 }
3906 }
3907
3908 typedef struct InvalidEnvInfo {
3909 const Unit *unit;
3910 const char *path;
3911 } InvalidEnvInfo;
3912
3913 static void invalid_env(const char *p, void *userdata) {
3914 InvalidEnvInfo *info = userdata;
3915
3916 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3917 }
3918
3919 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3920 assert(c);
3921
3922 switch (fd_index) {
3923
3924 case STDIN_FILENO:
3925 if (c->std_input != EXEC_INPUT_NAMED_FD)
3926 return NULL;
3927
3928 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3929
3930 case STDOUT_FILENO:
3931 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3932 return NULL;
3933
3934 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3935
3936 case STDERR_FILENO:
3937 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3938 return NULL;
3939
3940 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3941
3942 default:
3943 return NULL;
3944 }
3945 }
3946
3947 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3948 size_t i, targets;
3949 const char* stdio_fdname[3];
3950 size_t n_fds;
3951
3952 assert(c);
3953 assert(p);
3954
3955 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3956 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3957 (c->std_error == EXEC_OUTPUT_NAMED_FD);
3958
3959 for (i = 0; i < 3; i++)
3960 stdio_fdname[i] = exec_context_fdname(c, i);
3961
3962 n_fds = p->n_storage_fds + p->n_socket_fds;
3963
3964 for (i = 0; i < n_fds && targets > 0; i++)
3965 if (named_iofds[STDIN_FILENO] < 0 &&
3966 c->std_input == EXEC_INPUT_NAMED_FD &&
3967 stdio_fdname[STDIN_FILENO] &&
3968 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3969
3970 named_iofds[STDIN_FILENO] = p->fds[i];
3971 targets--;
3972
3973 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3974 c->std_output == EXEC_OUTPUT_NAMED_FD &&
3975 stdio_fdname[STDOUT_FILENO] &&
3976 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3977
3978 named_iofds[STDOUT_FILENO] = p->fds[i];
3979 targets--;
3980
3981 } else if (named_iofds[STDERR_FILENO] < 0 &&
3982 c->std_error == EXEC_OUTPUT_NAMED_FD &&
3983 stdio_fdname[STDERR_FILENO] &&
3984 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3985
3986 named_iofds[STDERR_FILENO] = p->fds[i];
3987 targets--;
3988 }
3989
3990 return targets == 0 ? 0 : -ENOENT;
3991 }
3992
3993 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
3994 char **i, **r = NULL;
3995
3996 assert(c);
3997 assert(l);
3998
3999 STRV_FOREACH(i, c->environment_files) {
4000 char *fn;
4001 int k;
4002 unsigned n;
4003 bool ignore = false;
4004 char **p;
4005 _cleanup_globfree_ glob_t pglob = {};
4006
4007 fn = *i;
4008
4009 if (fn[0] == '-') {
4010 ignore = true;
4011 fn++;
4012 }
4013
4014 if (!path_is_absolute(fn)) {
4015 if (ignore)
4016 continue;
4017
4018 strv_free(r);
4019 return -EINVAL;
4020 }
4021
4022 /* Filename supports globbing, take all matching files */
4023 k = safe_glob(fn, 0, &pglob);
4024 if (k < 0) {
4025 if (ignore)
4026 continue;
4027
4028 strv_free(r);
4029 return k;
4030 }
4031
4032 /* When we don't match anything, -ENOENT should be returned */
4033 assert(pglob.gl_pathc > 0);
4034
4035 for (n = 0; n < pglob.gl_pathc; n++) {
4036 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4037 if (k < 0) {
4038 if (ignore)
4039 continue;
4040
4041 strv_free(r);
4042 return k;
4043 }
4044 /* Log invalid environment variables with filename */
4045 if (p) {
4046 InvalidEnvInfo info = {
4047 .unit = unit,
4048 .path = pglob.gl_pathv[n]
4049 };
4050
4051 p = strv_env_clean_with_callback(p, invalid_env, &info);
4052 }
4053
4054 if (!r)
4055 r = p;
4056 else {
4057 char **m;
4058
4059 m = strv_env_merge(2, r, p);
4060 strv_free(r);
4061 strv_free(p);
4062 if (!m)
4063 return -ENOMEM;
4064
4065 r = m;
4066 }
4067 }
4068 }
4069
4070 *l = r;
4071
4072 return 0;
4073 }
4074
4075 static bool tty_may_match_dev_console(const char *tty) {
4076 _cleanup_free_ char *resolved = NULL;
4077
4078 if (!tty)
4079 return true;
4080
4081 tty = skip_dev_prefix(tty);
4082
4083 /* trivial identity? */
4084 if (streq(tty, "console"))
4085 return true;
4086
4087 if (resolve_dev_console(&resolved) < 0)
4088 return true; /* if we could not resolve, assume it may */
4089
4090 /* "tty0" means the active VC, so it may be the same sometimes */
4091 return streq(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4092 }
4093
4094 bool exec_context_may_touch_console(const ExecContext *ec) {
4095
4096 return (ec->tty_reset ||
4097 ec->tty_vhangup ||
4098 ec->tty_vt_disallocate ||
4099 is_terminal_input(ec->std_input) ||
4100 is_terminal_output(ec->std_output) ||
4101 is_terminal_output(ec->std_error)) &&
4102 tty_may_match_dev_console(exec_context_tty_path(ec));
4103 }
4104
4105 static void strv_fprintf(FILE *f, char **l) {
4106 char **g;
4107
4108 assert(f);
4109
4110 STRV_FOREACH(g, l)
4111 fprintf(f, " %s", *g);
4112 }
4113
4114 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4115 ExecDirectoryType dt;
4116 char **e, **d;
4117 unsigned i;
4118 int r;
4119
4120 assert(c);
4121 assert(f);
4122
4123 prefix = strempty(prefix);
4124
4125 fprintf(f,
4126 "%sUMask: %04o\n"
4127 "%sWorkingDirectory: %s\n"
4128 "%sRootDirectory: %s\n"
4129 "%sNonBlocking: %s\n"
4130 "%sPrivateTmp: %s\n"
4131 "%sPrivateDevices: %s\n"
4132 "%sProtectKernelTunables: %s\n"
4133 "%sProtectKernelModules: %s\n"
4134 "%sProtectControlGroups: %s\n"
4135 "%sPrivateNetwork: %s\n"
4136 "%sPrivateUsers: %s\n"
4137 "%sProtectHome: %s\n"
4138 "%sProtectSystem: %s\n"
4139 "%sMountAPIVFS: %s\n"
4140 "%sIgnoreSIGPIPE: %s\n"
4141 "%sMemoryDenyWriteExecute: %s\n"
4142 "%sRestrictRealtime: %s\n"
4143 "%sKeyringMode: %s\n",
4144 prefix, c->umask,
4145 prefix, c->working_directory ? c->working_directory : "/",
4146 prefix, c->root_directory ? c->root_directory : "/",
4147 prefix, yes_no(c->non_blocking),
4148 prefix, yes_no(c->private_tmp),
4149 prefix, yes_no(c->private_devices),
4150 prefix, yes_no(c->protect_kernel_tunables),
4151 prefix, yes_no(c->protect_kernel_modules),
4152 prefix, yes_no(c->protect_control_groups),
4153 prefix, yes_no(c->private_network),
4154 prefix, yes_no(c->private_users),
4155 prefix, protect_home_to_string(c->protect_home),
4156 prefix, protect_system_to_string(c->protect_system),
4157 prefix, yes_no(c->mount_apivfs),
4158 prefix, yes_no(c->ignore_sigpipe),
4159 prefix, yes_no(c->memory_deny_write_execute),
4160 prefix, yes_no(c->restrict_realtime),
4161 prefix, exec_keyring_mode_to_string(c->keyring_mode));
4162
4163 if (c->root_image)
4164 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4165
4166 STRV_FOREACH(e, c->environment)
4167 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4168
4169 STRV_FOREACH(e, c->environment_files)
4170 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4171
4172 STRV_FOREACH(e, c->pass_environment)
4173 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4174
4175 STRV_FOREACH(e, c->unset_environment)
4176 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4177
4178 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4179
4180 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4181 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4182
4183 STRV_FOREACH(d, c->directories[dt].paths)
4184 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4185 }
4186
4187 if (c->nice_set)
4188 fprintf(f,
4189 "%sNice: %i\n",
4190 prefix, c->nice);
4191
4192 if (c->oom_score_adjust_set)
4193 fprintf(f,
4194 "%sOOMScoreAdjust: %i\n",
4195 prefix, c->oom_score_adjust);
4196
4197 for (i = 0; i < RLIM_NLIMITS; i++)
4198 if (c->rlimit[i]) {
4199 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4200 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4201 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4202 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4203 }
4204
4205 if (c->ioprio_set) {
4206 _cleanup_free_ char *class_str = NULL;
4207
4208 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4209 if (r >= 0)
4210 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4211
4212 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4213 }
4214
4215 if (c->cpu_sched_set) {
4216 _cleanup_free_ char *policy_str = NULL;
4217
4218 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4219 if (r >= 0)
4220 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4221
4222 fprintf(f,
4223 "%sCPUSchedulingPriority: %i\n"
4224 "%sCPUSchedulingResetOnFork: %s\n",
4225 prefix, c->cpu_sched_priority,
4226 prefix, yes_no(c->cpu_sched_reset_on_fork));
4227 }
4228
4229 if (c->cpuset) {
4230 fprintf(f, "%sCPUAffinity:", prefix);
4231 for (i = 0; i < c->cpuset_ncpus; i++)
4232 if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
4233 fprintf(f, " %u", i);
4234 fputs("\n", f);
4235 }
4236
4237 if (c->timer_slack_nsec != NSEC_INFINITY)
4238 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4239
4240 fprintf(f,
4241 "%sStandardInput: %s\n"
4242 "%sStandardOutput: %s\n"
4243 "%sStandardError: %s\n",
4244 prefix, exec_input_to_string(c->std_input),
4245 prefix, exec_output_to_string(c->std_output),
4246 prefix, exec_output_to_string(c->std_error));
4247
4248 if (c->std_input == EXEC_INPUT_NAMED_FD)
4249 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4250 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4251 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4252 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4253 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4254
4255 if (c->std_input == EXEC_INPUT_FILE)
4256 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4257 if (c->std_output == EXEC_OUTPUT_FILE)
4258 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4259 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4260 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4261 if (c->std_error == EXEC_OUTPUT_FILE)
4262 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4263 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4264 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4265
4266 if (c->tty_path)
4267 fprintf(f,
4268 "%sTTYPath: %s\n"
4269 "%sTTYReset: %s\n"
4270 "%sTTYVHangup: %s\n"
4271 "%sTTYVTDisallocate: %s\n",
4272 prefix, c->tty_path,
4273 prefix, yes_no(c->tty_reset),
4274 prefix, yes_no(c->tty_vhangup),
4275 prefix, yes_no(c->tty_vt_disallocate));
4276
4277 if (IN_SET(c->std_output,
4278 EXEC_OUTPUT_SYSLOG,
4279 EXEC_OUTPUT_KMSG,
4280 EXEC_OUTPUT_JOURNAL,
4281 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4282 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4283 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4284 IN_SET(c->std_error,
4285 EXEC_OUTPUT_SYSLOG,
4286 EXEC_OUTPUT_KMSG,
4287 EXEC_OUTPUT_JOURNAL,
4288 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4289 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4290 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4291
4292 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4293
4294 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4295 if (r >= 0)
4296 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4297
4298 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4299 if (r >= 0)
4300 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4301 }
4302
4303 if (c->log_level_max >= 0) {
4304 _cleanup_free_ char *t = NULL;
4305
4306 (void) log_level_to_string_alloc(c->log_level_max, &t);
4307
4308 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4309 }
4310
4311 if (c->log_rate_limit_interval_usec > 0) {
4312 char buf_timespan[FORMAT_TIMESPAN_MAX];
4313
4314 fprintf(f,
4315 "%sLogRateLimitIntervalSec: %s\n",
4316 prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_rate_limit_interval_usec, USEC_PER_SEC));
4317 }
4318
4319 if (c->log_rate_limit_burst > 0)
4320 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_rate_limit_burst);
4321
4322 if (c->n_log_extra_fields > 0) {
4323 size_t j;
4324
4325 for (j = 0; j < c->n_log_extra_fields; j++) {
4326 fprintf(f, "%sLogExtraFields: ", prefix);
4327 fwrite(c->log_extra_fields[j].iov_base,
4328 1, c->log_extra_fields[j].iov_len,
4329 f);
4330 fputc('\n', f);
4331 }
4332 }
4333
4334 if (c->secure_bits) {
4335 _cleanup_free_ char *str = NULL;
4336
4337 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4338 if (r >= 0)
4339 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4340 }
4341
4342 if (c->capability_bounding_set != CAP_ALL) {
4343 _cleanup_free_ char *str = NULL;
4344
4345 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4346 if (r >= 0)
4347 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4348 }
4349
4350 if (c->capability_ambient_set != 0) {
4351 _cleanup_free_ char *str = NULL;
4352
4353 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4354 if (r >= 0)
4355 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4356 }
4357
4358 if (c->user)
4359 fprintf(f, "%sUser: %s\n", prefix, c->user);
4360 if (c->group)
4361 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4362
4363 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4364
4365 if (!strv_isempty(c->supplementary_groups)) {
4366 fprintf(f, "%sSupplementaryGroups:", prefix);
4367 strv_fprintf(f, c->supplementary_groups);
4368 fputs("\n", f);
4369 }
4370
4371 if (c->pam_name)
4372 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4373
4374 if (!strv_isempty(c->read_write_paths)) {
4375 fprintf(f, "%sReadWritePaths:", prefix);
4376 strv_fprintf(f, c->read_write_paths);
4377 fputs("\n", f);
4378 }
4379
4380 if (!strv_isempty(c->read_only_paths)) {
4381 fprintf(f, "%sReadOnlyPaths:", prefix);
4382 strv_fprintf(f, c->read_only_paths);
4383 fputs("\n", f);
4384 }
4385
4386 if (!strv_isempty(c->inaccessible_paths)) {
4387 fprintf(f, "%sInaccessiblePaths:", prefix);
4388 strv_fprintf(f, c->inaccessible_paths);
4389 fputs("\n", f);
4390 }
4391
4392 if (c->n_bind_mounts > 0)
4393 for (i = 0; i < c->n_bind_mounts; i++)
4394 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4395 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4396 c->bind_mounts[i].ignore_enoent ? "-": "",
4397 c->bind_mounts[i].source,
4398 c->bind_mounts[i].destination,
4399 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4400
4401 if (c->n_temporary_filesystems > 0)
4402 for (i = 0; i < c->n_temporary_filesystems; i++) {
4403 TemporaryFileSystem *t = c->temporary_filesystems + i;
4404
4405 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4406 t->path,
4407 isempty(t->options) ? "" : ":",
4408 strempty(t->options));
4409 }
4410
4411 if (c->utmp_id)
4412 fprintf(f,
4413 "%sUtmpIdentifier: %s\n",
4414 prefix, c->utmp_id);
4415
4416 if (c->selinux_context)
4417 fprintf(f,
4418 "%sSELinuxContext: %s%s\n",
4419 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4420
4421 if (c->apparmor_profile)
4422 fprintf(f,
4423 "%sAppArmorProfile: %s%s\n",
4424 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4425
4426 if (c->smack_process_label)
4427 fprintf(f,
4428 "%sSmackProcessLabel: %s%s\n",
4429 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4430
4431 if (c->personality != PERSONALITY_INVALID)
4432 fprintf(f,
4433 "%sPersonality: %s\n",
4434 prefix, strna(personality_to_string(c->personality)));
4435
4436 fprintf(f,
4437 "%sLockPersonality: %s\n",
4438 prefix, yes_no(c->lock_personality));
4439
4440 if (c->syscall_filter) {
4441 #if HAVE_SECCOMP
4442 Iterator j;
4443 void *id, *val;
4444 bool first = true;
4445 #endif
4446
4447 fprintf(f,
4448 "%sSystemCallFilter: ",
4449 prefix);
4450
4451 if (!c->syscall_whitelist)
4452 fputc('~', f);
4453
4454 #if HAVE_SECCOMP
4455 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4456 _cleanup_free_ char *name = NULL;
4457 const char *errno_name = NULL;
4458 int num = PTR_TO_INT(val);
4459
4460 if (first)
4461 first = false;
4462 else
4463 fputc(' ', f);
4464
4465 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4466 fputs(strna(name), f);
4467
4468 if (num >= 0) {
4469 errno_name = errno_to_name(num);
4470 if (errno_name)
4471 fprintf(f, ":%s", errno_name);
4472 else
4473 fprintf(f, ":%d", num);
4474 }
4475 }
4476 #endif
4477
4478 fputc('\n', f);
4479 }
4480
4481 if (c->syscall_archs) {
4482 #if HAVE_SECCOMP
4483 Iterator j;
4484 void *id;
4485 #endif
4486
4487 fprintf(f,
4488 "%sSystemCallArchitectures:",
4489 prefix);
4490
4491 #if HAVE_SECCOMP
4492 SET_FOREACH(id, c->syscall_archs, j)
4493 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4494 #endif
4495 fputc('\n', f);
4496 }
4497
4498 if (exec_context_restrict_namespaces_set(c)) {
4499 _cleanup_free_ char *s = NULL;
4500
4501 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4502 if (r >= 0)
4503 fprintf(f, "%sRestrictNamespaces: %s\n",
4504 prefix, s);
4505 }
4506
4507 if (c->syscall_errno > 0) {
4508 const char *errno_name;
4509
4510 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4511
4512 errno_name = errno_to_name(c->syscall_errno);
4513 if (errno_name)
4514 fprintf(f, "%s\n", errno_name);
4515 else
4516 fprintf(f, "%d\n", c->syscall_errno);
4517 }
4518
4519 if (c->apparmor_profile)
4520 fprintf(f,
4521 "%sAppArmorProfile: %s%s\n",
4522 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4523 }
4524
4525 bool exec_context_maintains_privileges(const ExecContext *c) {
4526 assert(c);
4527
4528 /* Returns true if the process forked off would run under
4529 * an unchanged UID or as root. */
4530
4531 if (!c->user)
4532 return true;
4533
4534 if (streq(c->user, "root") || streq(c->user, "0"))
4535 return true;
4536
4537 return false;
4538 }
4539
4540 int exec_context_get_effective_ioprio(const ExecContext *c) {
4541 int p;
4542
4543 assert(c);
4544
4545 if (c->ioprio_set)
4546 return c->ioprio;
4547
4548 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4549 if (p < 0)
4550 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4551
4552 return p;
4553 }
4554
4555 void exec_context_free_log_extra_fields(ExecContext *c) {
4556 size_t l;
4557
4558 assert(c);
4559
4560 for (l = 0; l < c->n_log_extra_fields; l++)
4561 free(c->log_extra_fields[l].iov_base);
4562 c->log_extra_fields = mfree(c->log_extra_fields);
4563 c->n_log_extra_fields = 0;
4564 }
4565
4566 void exec_status_start(ExecStatus *s, pid_t pid) {
4567 assert(s);
4568
4569 *s = (ExecStatus) {
4570 .pid = pid,
4571 };
4572
4573 dual_timestamp_get(&s->start_timestamp);
4574 }
4575
4576 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4577 assert(s);
4578
4579 if (s->pid != pid) {
4580 *s = (ExecStatus) {
4581 .pid = pid,
4582 };
4583 }
4584
4585 dual_timestamp_get(&s->exit_timestamp);
4586
4587 s->code = code;
4588 s->status = status;
4589
4590 if (context) {
4591 if (context->utmp_id)
4592 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
4593
4594 exec_context_tty_reset(context, NULL);
4595 }
4596 }
4597
4598 void exec_status_reset(ExecStatus *s) {
4599 assert(s);
4600
4601 *s = (ExecStatus) {};
4602 }
4603
4604 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4605 char buf[FORMAT_TIMESTAMP_MAX];
4606
4607 assert(s);
4608 assert(f);
4609
4610 if (s->pid <= 0)
4611 return;
4612
4613 prefix = strempty(prefix);
4614
4615 fprintf(f,
4616 "%sPID: "PID_FMT"\n",
4617 prefix, s->pid);
4618
4619 if (dual_timestamp_is_set(&s->start_timestamp))
4620 fprintf(f,
4621 "%sStart Timestamp: %s\n",
4622 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4623
4624 if (dual_timestamp_is_set(&s->exit_timestamp))
4625 fprintf(f,
4626 "%sExit Timestamp: %s\n"
4627 "%sExit Code: %s\n"
4628 "%sExit Status: %i\n",
4629 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4630 prefix, sigchld_code_to_string(s->code),
4631 prefix, s->status);
4632 }
4633
4634 static char *exec_command_line(char **argv) {
4635 size_t k;
4636 char *n, *p, **a;
4637 bool first = true;
4638
4639 assert(argv);
4640
4641 k = 1;
4642 STRV_FOREACH(a, argv)
4643 k += strlen(*a)+3;
4644
4645 n = new(char, k);
4646 if (!n)
4647 return NULL;
4648
4649 p = n;
4650 STRV_FOREACH(a, argv) {
4651
4652 if (!first)
4653 *(p++) = ' ';
4654 else
4655 first = false;
4656
4657 if (strpbrk(*a, WHITESPACE)) {
4658 *(p++) = '\'';
4659 p = stpcpy(p, *a);
4660 *(p++) = '\'';
4661 } else
4662 p = stpcpy(p, *a);
4663
4664 }
4665
4666 *p = 0;
4667
4668 /* FIXME: this doesn't really handle arguments that have
4669 * spaces and ticks in them */
4670
4671 return n;
4672 }
4673
4674 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4675 _cleanup_free_ char *cmd = NULL;
4676 const char *prefix2;
4677
4678 assert(c);
4679 assert(f);
4680
4681 prefix = strempty(prefix);
4682 prefix2 = strjoina(prefix, "\t");
4683
4684 cmd = exec_command_line(c->argv);
4685 fprintf(f,
4686 "%sCommand Line: %s\n",
4687 prefix, cmd ? cmd : strerror(ENOMEM));
4688
4689 exec_status_dump(&c->exec_status, f, prefix2);
4690 }
4691
4692 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4693 assert(f);
4694
4695 prefix = strempty(prefix);
4696
4697 LIST_FOREACH(command, c, c)
4698 exec_command_dump(c, f, prefix);
4699 }
4700
4701 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4702 ExecCommand *end;
4703
4704 assert(l);
4705 assert(e);
4706
4707 if (*l) {
4708 /* It's kind of important, that we keep the order here */
4709 LIST_FIND_TAIL(command, *l, end);
4710 LIST_INSERT_AFTER(command, *l, end, e);
4711 } else
4712 *l = e;
4713 }
4714
4715 int exec_command_set(ExecCommand *c, const char *path, ...) {
4716 va_list ap;
4717 char **l, *p;
4718
4719 assert(c);
4720 assert(path);
4721
4722 va_start(ap, path);
4723 l = strv_new_ap(path, ap);
4724 va_end(ap);
4725
4726 if (!l)
4727 return -ENOMEM;
4728
4729 p = strdup(path);
4730 if (!p) {
4731 strv_free(l);
4732 return -ENOMEM;
4733 }
4734
4735 free_and_replace(c->path, p);
4736
4737 return strv_free_and_replace(c->argv, l);
4738 }
4739
4740 int exec_command_append(ExecCommand *c, const char *path, ...) {
4741 _cleanup_strv_free_ char **l = NULL;
4742 va_list ap;
4743 int r;
4744
4745 assert(c);
4746 assert(path);
4747
4748 va_start(ap, path);
4749 l = strv_new_ap(path, ap);
4750 va_end(ap);
4751
4752 if (!l)
4753 return -ENOMEM;
4754
4755 r = strv_extend_strv(&c->argv, l, false);
4756 if (r < 0)
4757 return r;
4758
4759 return 0;
4760 }
4761
4762 static void *remove_tmpdir_thread(void *p) {
4763 _cleanup_free_ char *path = p;
4764
4765 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4766 return NULL;
4767 }
4768
4769 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
4770 int r;
4771
4772 if (!rt)
4773 return NULL;
4774
4775 if (rt->manager)
4776 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
4777
4778 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
4779 if (destroy && rt->tmp_dir) {
4780 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4781
4782 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4783 if (r < 0) {
4784 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4785 free(rt->tmp_dir);
4786 }
4787
4788 rt->tmp_dir = NULL;
4789 }
4790
4791 if (destroy && rt->var_tmp_dir) {
4792 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4793
4794 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4795 if (r < 0) {
4796 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4797 free(rt->var_tmp_dir);
4798 }
4799
4800 rt->var_tmp_dir = NULL;
4801 }
4802
4803 rt->id = mfree(rt->id);
4804 rt->tmp_dir = mfree(rt->tmp_dir);
4805 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
4806 safe_close_pair(rt->netns_storage_socket);
4807 return mfree(rt);
4808 }
4809
4810 static void exec_runtime_freep(ExecRuntime **rt) {
4811 if (*rt)
4812 (void) exec_runtime_free(*rt, false);
4813 }
4814
4815 static int exec_runtime_allocate(ExecRuntime **rt) {
4816 assert(rt);
4817
4818 *rt = new0(ExecRuntime, 1);
4819 if (!*rt)
4820 return -ENOMEM;
4821
4822 (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4823 return 0;
4824 }
4825
4826 static int exec_runtime_add(
4827 Manager *m,
4828 const char *id,
4829 const char *tmp_dir,
4830 const char *var_tmp_dir,
4831 const int netns_storage_socket[2],
4832 ExecRuntime **ret) {
4833
4834 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
4835 int r;
4836
4837 assert(m);
4838 assert(id);
4839
4840 r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
4841 if (r < 0)
4842 return r;
4843
4844 r = exec_runtime_allocate(&rt);
4845 if (r < 0)
4846 return r;
4847
4848 rt->id = strdup(id);
4849 if (!rt->id)
4850 return -ENOMEM;
4851
4852 if (tmp_dir) {
4853 rt->tmp_dir = strdup(tmp_dir);
4854 if (!rt->tmp_dir)
4855 return -ENOMEM;
4856
4857 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
4858 assert(var_tmp_dir);
4859 rt->var_tmp_dir = strdup(var_tmp_dir);
4860 if (!rt->var_tmp_dir)
4861 return -ENOMEM;
4862 }
4863
4864 if (netns_storage_socket) {
4865 rt->netns_storage_socket[0] = netns_storage_socket[0];
4866 rt->netns_storage_socket[1] = netns_storage_socket[1];
4867 }
4868
4869 r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
4870 if (r < 0)
4871 return r;
4872
4873 rt->manager = m;
4874
4875 if (ret)
4876 *ret = rt;
4877
4878 /* do not remove created ExecRuntime object when the operation succeeds. */
4879 rt = NULL;
4880 return 0;
4881 }
4882
4883 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
4884 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
4885 _cleanup_close_pair_ int netns_storage_socket[2] = {-1, -1};
4886 int r;
4887
4888 assert(m);
4889 assert(c);
4890 assert(id);
4891
4892 /* It is not necessary to create ExecRuntime object. */
4893 if (!c->private_network && !c->private_tmp)
4894 return 0;
4895
4896 if (c->private_tmp) {
4897 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
4898 if (r < 0)
4899 return r;
4900 }
4901
4902 if (c->private_network) {
4903 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
4904 return -errno;
4905 }
4906
4907 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
4908 if (r < 0)
4909 return r;
4910
4911 /* Avoid cleanup */
4912 netns_storage_socket[0] = -1;
4913 netns_storage_socket[1] = -1;
4914 return 1;
4915 }
4916
4917 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
4918 ExecRuntime *rt;
4919 int r;
4920
4921 assert(m);
4922 assert(id);
4923 assert(ret);
4924
4925 rt = hashmap_get(m->exec_runtime_by_id, id);
4926 if (rt)
4927 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
4928 goto ref;
4929
4930 if (!create)
4931 return 0;
4932
4933 /* If not found, then create a new object. */
4934 r = exec_runtime_make(m, c, id, &rt);
4935 if (r <= 0)
4936 /* When r == 0, it is not necessary to create ExecRuntime object. */
4937 return r;
4938
4939 ref:
4940 /* increment reference counter. */
4941 rt->n_ref++;
4942 *ret = rt;
4943 return 1;
4944 }
4945
4946 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
4947 if (!rt)
4948 return NULL;
4949
4950 assert(rt->n_ref > 0);
4951
4952 rt->n_ref--;
4953 if (rt->n_ref > 0)
4954 return NULL;
4955
4956 return exec_runtime_free(rt, destroy);
4957 }
4958
4959 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
4960 ExecRuntime *rt;
4961 Iterator i;
4962
4963 assert(m);
4964 assert(f);
4965 assert(fds);
4966
4967 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4968 fprintf(f, "exec-runtime=%s", rt->id);
4969
4970 if (rt->tmp_dir)
4971 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
4972
4973 if (rt->var_tmp_dir)
4974 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
4975
4976 if (rt->netns_storage_socket[0] >= 0) {
4977 int copy;
4978
4979 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4980 if (copy < 0)
4981 return copy;
4982
4983 fprintf(f, " netns-socket-0=%i", copy);
4984 }
4985
4986 if (rt->netns_storage_socket[1] >= 0) {
4987 int copy;
4988
4989 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4990 if (copy < 0)
4991 return copy;
4992
4993 fprintf(f, " netns-socket-1=%i", copy);
4994 }
4995
4996 fputc('\n', f);
4997 }
4998
4999 return 0;
5000 }
5001
5002 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5003 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5004 ExecRuntime *rt;
5005 int r;
5006
5007 /* This is for the migration from old (v237 or earlier) deserialization text.
5008 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5009 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5010 * so or not from the serialized text, then we always creates a new object owned by this. */
5011
5012 assert(u);
5013 assert(key);
5014 assert(value);
5015
5016 /* Manager manages ExecRuntime objects by the unit id.
5017 * So, we omit the serialized text when the unit does not have id (yet?)... */
5018 if (isempty(u->id)) {
5019 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5020 return 0;
5021 }
5022
5023 r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5024 if (r < 0) {
5025 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5026 return 0;
5027 }
5028
5029 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5030 if (!rt) {
5031 r = exec_runtime_allocate(&rt_create);
5032 if (r < 0)
5033 return log_oom();
5034
5035 rt_create->id = strdup(u->id);
5036 if (!rt_create->id)
5037 return log_oom();
5038
5039 rt = rt_create;
5040 }
5041
5042 if (streq(key, "tmp-dir")) {
5043 char *copy;
5044
5045 copy = strdup(value);
5046 if (!copy)
5047 return log_oom();
5048
5049 free_and_replace(rt->tmp_dir, copy);
5050
5051 } else if (streq(key, "var-tmp-dir")) {
5052 char *copy;
5053
5054 copy = strdup(value);
5055 if (!copy)
5056 return log_oom();
5057
5058 free_and_replace(rt->var_tmp_dir, copy);
5059
5060 } else if (streq(key, "netns-socket-0")) {
5061 int fd;
5062
5063 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5064 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5065 return 0;
5066 }
5067
5068 safe_close(rt->netns_storage_socket[0]);
5069 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5070
5071 } else if (streq(key, "netns-socket-1")) {
5072 int fd;
5073
5074 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5075 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5076 return 0;
5077 }
5078
5079 safe_close(rt->netns_storage_socket[1]);
5080 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5081 } else
5082 return 0;
5083
5084 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5085 if (rt_create) {
5086 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5087 if (r < 0) {
5088 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5089 return 0;
5090 }
5091
5092 rt_create->manager = u->manager;
5093
5094 /* Avoid cleanup */
5095 rt_create = NULL;
5096 }
5097
5098 return 1;
5099 }
5100
5101 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5102 char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5103 int r, fd0 = -1, fd1 = -1;
5104 const char *p, *v = value;
5105 size_t n;
5106
5107 assert(m);
5108 assert(value);
5109 assert(fds);
5110
5111 n = strcspn(v, " ");
5112 id = strndupa(v, n);
5113 if (v[n] != ' ')
5114 goto finalize;
5115 p = v + n + 1;
5116
5117 v = startswith(p, "tmp-dir=");
5118 if (v) {
5119 n = strcspn(v, " ");
5120 tmp_dir = strndupa(v, n);
5121 if (v[n] != ' ')
5122 goto finalize;
5123 p = v + n + 1;
5124 }
5125
5126 v = startswith(p, "var-tmp-dir=");
5127 if (v) {
5128 n = strcspn(v, " ");
5129 var_tmp_dir = strndupa(v, n);
5130 if (v[n] != ' ')
5131 goto finalize;
5132 p = v + n + 1;
5133 }
5134
5135 v = startswith(p, "netns-socket-0=");
5136 if (v) {
5137 char *buf;
5138
5139 n = strcspn(v, " ");
5140 buf = strndupa(v, n);
5141 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5142 log_debug("Unable to process exec-runtime netns fd specification.");
5143 return;
5144 }
5145 fd0 = fdset_remove(fds, fd0);
5146 if (v[n] != ' ')
5147 goto finalize;
5148 p = v + n + 1;
5149 }
5150
5151 v = startswith(p, "netns-socket-1=");
5152 if (v) {
5153 char *buf;
5154
5155 n = strcspn(v, " ");
5156 buf = strndupa(v, n);
5157 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5158 log_debug("Unable to process exec-runtime netns fd specification.");
5159 return;
5160 }
5161 fd1 = fdset_remove(fds, fd1);
5162 }
5163
5164 finalize:
5165
5166 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5167 if (r < 0)
5168 log_debug_errno(r, "Failed to add exec-runtime: %m");
5169 }
5170
5171 void exec_runtime_vacuum(Manager *m) {
5172 ExecRuntime *rt;
5173 Iterator i;
5174
5175 assert(m);
5176
5177 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5178
5179 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5180 if (rt->n_ref > 0)
5181 continue;
5182
5183 (void) exec_runtime_free(rt, false);
5184 }
5185 }
5186
5187 void exec_params_clear(ExecParameters *p) {
5188 if (!p)
5189 return;
5190
5191 strv_free(p->environment);
5192 }
5193
5194 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5195 [EXEC_INPUT_NULL] = "null",
5196 [EXEC_INPUT_TTY] = "tty",
5197 [EXEC_INPUT_TTY_FORCE] = "tty-force",
5198 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
5199 [EXEC_INPUT_SOCKET] = "socket",
5200 [EXEC_INPUT_NAMED_FD] = "fd",
5201 [EXEC_INPUT_DATA] = "data",
5202 [EXEC_INPUT_FILE] = "file",
5203 };
5204
5205 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5206
5207 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5208 [EXEC_OUTPUT_INHERIT] = "inherit",
5209 [EXEC_OUTPUT_NULL] = "null",
5210 [EXEC_OUTPUT_TTY] = "tty",
5211 [EXEC_OUTPUT_SYSLOG] = "syslog",
5212 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
5213 [EXEC_OUTPUT_KMSG] = "kmsg",
5214 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5215 [EXEC_OUTPUT_JOURNAL] = "journal",
5216 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5217 [EXEC_OUTPUT_SOCKET] = "socket",
5218 [EXEC_OUTPUT_NAMED_FD] = "fd",
5219 [EXEC_OUTPUT_FILE] = "file",
5220 [EXEC_OUTPUT_FILE_APPEND] = "append",
5221 };
5222
5223 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5224
5225 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5226 [EXEC_UTMP_INIT] = "init",
5227 [EXEC_UTMP_LOGIN] = "login",
5228 [EXEC_UTMP_USER] = "user",
5229 };
5230
5231 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5232
5233 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5234 [EXEC_PRESERVE_NO] = "no",
5235 [EXEC_PRESERVE_YES] = "yes",
5236 [EXEC_PRESERVE_RESTART] = "restart",
5237 };
5238
5239 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5240
5241 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5242 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5243 [EXEC_DIRECTORY_STATE] = "StateDirectory",
5244 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5245 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5246 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5247 };
5248
5249 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5250
5251 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5252 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
5253 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
5254 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
5255 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
5256 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
5257 };
5258
5259 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5260
5261 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5262 [EXEC_KEYRING_INHERIT] = "inherit",
5263 [EXEC_KEYRING_PRIVATE] = "private",
5264 [EXEC_KEYRING_SHARED] = "shared",
5265 };
5266
5267 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);