]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/exec-invoke.c
execute: add new helper exec_context_apply_tty_size()
[thirdparty/systemd.git] / src / core / exec-invoke.c
CommitLineData
75689fb2
LB
1/* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3#include <sys/eventfd.h>
4#include <sys/ioctl.h>
5#include <sys/mount.h>
6#include <sys/prctl.h>
7
8#if HAVE_PAM
9#include <security/pam_appl.h>
10#include <security/pam_misc.h>
11#endif
12
13#if HAVE_APPARMOR
14#include <sys/apparmor.h>
15#endif
16
17#include "sd-messages.h"
18
19#if HAVE_APPARMOR
20#include "apparmor-util.h"
21#endif
22#include "argv-util.h"
23#include "barrier.h"
24#include "bpf-dlopen.h"
25#include "bpf-lsm.h"
26#include "btrfs-util.h"
27#include "capability-util.h"
28#include "cgroup-setup.h"
29#include "chase.h"
30#include "chattr-util.h"
31#include "chown-recursive.h"
32#include "copy.h"
33#include "data-fd-util.h"
34#include "env-util.h"
35#include "escape.h"
36#include "exec-credential.h"
37#include "exec-invoke.h"
38#include "execute.h"
39#include "exit-status.h"
40#include "fd-util.h"
41#include "hexdecoct.h"
42#include "io-util.h"
bd1ae178 43#include "iovec-util.h"
75689fb2
LB
44#include "missing_ioprio.h"
45#include "missing_prctl.h"
46#include "missing_securebits.h"
47#include "missing_syscall.h"
48#include "mkdir-label.h"
49#include "proc-cmdline.h"
50#include "process-util.h"
51#include "psi-util.h"
52#include "rlimit-util.h"
53#include "seccomp-util.h"
54#include "selinux-util.h"
55#include "signal-util.h"
56#include "smack-util.h"
57#include "socket-util.h"
58#include "string-table.h"
59#include "strv.h"
60#include "terminal-util.h"
61#include "utmp-wtmp.h"
62
/* Two idle timeouts, expressed in microseconds (5s and 1s).
 * NOTE(review): their consumers are outside this part of the file — confirm the exact usage there. */
#define IDLE_TIMEOUT_USEC  (5 * USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1 * USEC_PER_SEC)

/* Send buffer size (8 MiB) requested for the stream socket connected to the journal. */
#define SNDBUF_SIZE (8 * 1024 * 1024)
67
/* Relocates the passed file descriptors so that fds[i] ends up being descriptor number i+3, i.e. they
 * directly follow stdin/stdout/stderr. The array is updated in place with the new descriptor numbers.
 *
 * Returns 0 on success, or a negative errno-style error if duplicating a descriptor fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds <= 0)
                return 0;

        assert(fds);

        for (;;) {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, moved;

                        /* Nothing to do if this one already sits in its slot */
                        if (fds[idx] == wanted)
                                continue;

                        /* F_DUPFD hands out the lowest free descriptor >= wanted */
                        moved = fcntl(fds[idx], F_DUPFD, wanted);
                        if (moved < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = moved;

                        /* The slot was still occupied (by one of the later fds, which we have not moved
                         * yet), hence remember the first such index and make another pass from there. */
                        if (retry_at < 0 && moved != wanted)
                                retry_at = idx;
                }

                if (retry_at < 0)
                        return 0;

                first = retry_at;
        }
}
107
/* Adjusts the file descriptor flags of the fds we are about to pass on: FD_CLOEXEC is always turned off,
 * and O_NONBLOCK is set according to 'nonblock' — but only for the first 'n_socket_fds' entries, i.e.
 * the socket activation fds.
 *
 * Returns 0 on success, or the negative error of the first helper call that failed. */
static int flags_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        if (n_fds <= 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < n_fds; idx++) {
                bool is_socket_fd = idx < n_socket_fds;
                int r;

                if (is_socket_fd) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* These fds are supposed to survive exec(), hence FD_CLOEXEC goes away unconditionally */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
143
144static bool is_terminal_input(ExecInput i) {
145 return IN_SET(i,
146 EXEC_INPUT_TTY,
147 EXEC_INPUT_TTY_FORCE,
148 EXEC_INPUT_TTY_FAIL);
149}
150
151static bool is_terminal_output(ExecOutput o) {
152 return IN_SET(o,
153 EXEC_OUTPUT_TTY,
154 EXEC_OUTPUT_KMSG_AND_CONSOLE,
155 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
156}
157
158static bool is_kmsg_output(ExecOutput o) {
159 return IN_SET(o,
160 EXEC_OUTPUT_KMSG,
161 EXEC_OUTPUT_KMSG_AND_CONSOLE);
162}
163
164static bool exec_context_needs_term(const ExecContext *c) {
165 assert(c);
166
167 /* Return true if the execution context suggests we should set $TERM to something useful. */
168
169 if (is_terminal_input(c->std_input))
170 return true;
171
172 if (is_terminal_output(c->std_output))
173 return true;
174
175 if (is_terminal_output(c->std_error))
176 return true;
177
178 return !!c->tty_path;
179}
180
/* Opens /dev/null with the specified access flags (plus O_NOCTTY) and moves the result to descriptor
 * number 'nfd'. Returns whatever move_fd() returns, or a negative errno if the open() fails. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        return move_fd(null_fd, nfd, false);
}
192
193static int connect_journal_socket(
194 int fd,
195 const char *log_namespace,
196 uid_t uid,
197 gid_t gid) {
198
199 uid_t olduid = UID_INVALID;
200 gid_t oldgid = GID_INVALID;
201 const char *j;
202 int r;
203
204 j = log_namespace ?
205 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
206 "/run/systemd/journal/stdout";
207
208 if (gid_is_valid(gid)) {
209 oldgid = getgid();
210
211 if (setegid(gid) < 0)
212 return -errno;
213 }
214
215 if (uid_is_valid(uid)) {
216 olduid = getuid();
217
218 if (seteuid(uid) < 0) {
219 r = -errno;
220 goto restore_gid;
221 }
222 }
223
224 r = connect_unix_path(fd, AT_FDCWD, j);
225
226 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
227 an LSM interferes. */
228
229 if (uid_is_valid(uid))
230 (void) seteuid(olduid);
231
232 restore_gid:
233 if (gid_is_valid(gid))
234 (void) setegid(oldgid);
235
236 return r;
237}
238
239static int connect_logger_as(
240 const ExecContext *context,
241 const ExecParameters *params,
242 ExecOutput output,
243 const char *ident,
244 int nfd,
245 uid_t uid,
246 gid_t gid) {
247
248 _cleanup_close_ int fd = -EBADF;
249 int r;
250
251 assert(context);
252 assert(params);
253 assert(output < _EXEC_OUTPUT_MAX);
254 assert(ident);
255 assert(nfd >= 0);
256
257 fd = socket(AF_UNIX, SOCK_STREAM, 0);
258 if (fd < 0)
259 return -errno;
260
261 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
262 if (r < 0)
263 return r;
264
265 if (shutdown(fd, SHUT_RD) < 0)
266 return -errno;
267
268 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
269
270 if (dprintf(fd,
271 "%s\n"
272 "%s\n"
273 "%i\n"
274 "%i\n"
275 "%i\n"
276 "%i\n"
277 "%i\n",
278 context->syslog_identifier ?: ident,
279 params->flags & EXEC_PASS_LOG_UNIT ? params->unit_id : "",
280 context->syslog_priority,
281 !!context->syslog_level_prefix,
282 false,
283 is_kmsg_output(output),
284 is_terminal_output(output)) < 0)
285 return -errno;
286
287 return move_fd(TAKE_FD(fd), nfd, false);
288}
289
/* Opens the terminal at 'path' (never as controlling terminal, O_NOCTTY is always added) and moves it to
 * descriptor number 'nfd'. Returns the error of open_terminal() on failure, otherwise the result of
 * move_fd(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);
        if (tty_fd < 0)
                return tty_fd;

        return move_fd(tty_fd, nfd, false);
}
302
303static int acquire_path(const char *path, int flags, mode_t mode) {
304 _cleanup_close_ int fd = -EBADF;
305 int r;
306
307 assert(path);
308
309 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
310 flags |= O_CREAT;
311
312 fd = open(path, flags|O_NOCTTY, mode);
313 if (fd >= 0)
314 return TAKE_FD(fd);
315
316 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
317 return -errno;
318
319 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
320
321 fd = socket(AF_UNIX, SOCK_STREAM, 0);
322 if (fd < 0)
323 return -errno;
324
325 r = connect_unix_path(fd, AT_FDCWD, path);
326 if (IN_SET(r, -ENOTSOCK, -EINVAL))
327 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
328 * wasn't an AF_UNIX socket after all */
329 return -ENXIO;
330 if (r < 0)
331 return r;
332
333 if ((flags & O_ACCMODE) == O_RDONLY)
334 r = shutdown(fd, SHUT_WR);
335 else if ((flags & O_ACCMODE) == O_WRONLY)
336 r = shutdown(fd, SHUT_RD);
337 else
338 r = 0;
339 if (r < 0)
340 return -errno;
341
342 return TAKE_FD(fd);
343}
344
345static int fixup_input(
346 const ExecContext *context,
347 int socket_fd,
348 bool apply_tty_stdin) {
349
350 ExecInput std_input;
351
352 assert(context);
353
354 std_input = context->std_input;
355
356 if (is_terminal_input(std_input) && !apply_tty_stdin)
357 return EXEC_INPUT_NULL;
358
359 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
360 return EXEC_INPUT_NULL;
361
362 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
363 return EXEC_INPUT_NULL;
364
365 return std_input;
366}
367
368static int fixup_output(ExecOutput output, int socket_fd) {
369
370 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
371 return EXEC_OUTPUT_INHERIT;
372
373 return output;
374}
375
376static int setup_input(
377 const ExecContext *context,
378 const ExecParameters *params,
379 int socket_fd,
380 const int named_iofds[static 3]) {
381
382 ExecInput i;
383 int r;
384
385 assert(context);
386 assert(params);
387 assert(named_iofds);
388
389 if (params->stdin_fd >= 0) {
390 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
391 return -errno;
392
393 /* Try to make this the controlling tty, if it is a tty, and reset it */
394 if (isatty(STDIN_FILENO)) {
75689fb2
LB
395 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
396 (void) reset_terminal_fd(STDIN_FILENO, true);
d2b9e755 397 (void) exec_context_apply_tty_size(context, STDIN_FILENO, /* tty_path= */ NULL);
75689fb2
LB
398 }
399
400 return STDIN_FILENO;
401 }
402
403 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
404
405 switch (i) {
406
407 case EXEC_INPUT_NULL:
408 return open_null_as(O_RDONLY, STDIN_FILENO);
409
410 case EXEC_INPUT_TTY:
411 case EXEC_INPUT_TTY_FORCE:
412 case EXEC_INPUT_TTY_FAIL: {
d2b9e755
LP
413 _cleanup_close_ int tty_fd = -EBADF;
414 const char *tty_path;
75689fb2 415
d2b9e755
LP
416 tty_path = ASSERT_PTR(exec_context_tty_path(context));
417
418 tty_fd = acquire_terminal(tty_path,
419 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
420 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
421 ACQUIRE_TERMINAL_WAIT,
422 USEC_INFINITY);
423 if (tty_fd < 0)
424 return tty_fd;
75689fb2 425
d2b9e755 426 r = exec_context_apply_tty_size(context, tty_fd, tty_path);
75689fb2
LB
427 if (r < 0)
428 return r;
429
d2b9e755 430 r = move_fd(tty_fd, STDIN_FILENO, /* cloexec= */ false);
75689fb2
LB
431 if (r < 0)
432 return r;
433
d2b9e755
LP
434 TAKE_FD(tty_fd);
435 return r;
75689fb2
LB
436 }
437
438 case EXEC_INPUT_SOCKET:
439 assert(socket_fd >= 0);
440
441 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
442
443 case EXEC_INPUT_NAMED_FD:
444 assert(named_iofds[STDIN_FILENO] >= 0);
445
446 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
447 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
448
449 case EXEC_INPUT_DATA: {
450 int fd;
451
452 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
453 if (fd < 0)
454 return fd;
455
456 return move_fd(fd, STDIN_FILENO, false);
457 }
458
459 case EXEC_INPUT_FILE: {
460 bool rw;
461 int fd;
462
463 assert(context->stdio_file[STDIN_FILENO]);
464
465 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
466 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
467
468 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
469 if (fd < 0)
470 return fd;
471
472 return move_fd(fd, STDIN_FILENO, false);
473 }
474
475 default:
476 assert_not_reached();
477 }
478}
479
480static bool can_inherit_stderr_from_stdout(
481 const ExecContext *context,
482 ExecOutput o,
483 ExecOutput e) {
484
485 assert(context);
486
487 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
488 * stderr fd */
489
490 if (e == EXEC_OUTPUT_INHERIT)
491 return true;
492 if (e != o)
493 return false;
494
495 if (e == EXEC_OUTPUT_NAMED_FD)
496 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
497
498 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
499 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
500
501 return true;
502}
503
504static int setup_output(
505 const ExecContext *context,
506 const ExecParameters *params,
507 int fileno,
508 int socket_fd,
509 const int named_iofds[static 3],
510 const char *ident,
511 uid_t uid,
512 gid_t gid,
513 dev_t *journal_stream_dev,
514 ino_t *journal_stream_ino) {
515
516 ExecOutput o;
517 ExecInput i;
518 int r;
519
520 assert(context);
521 assert(params);
522 assert(ident);
523 assert(journal_stream_dev);
524 assert(journal_stream_ino);
525
526 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
527
528 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
529 return -errno;
530
531 return STDOUT_FILENO;
532 }
533
534 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
535 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
536 return -errno;
537
538 return STDERR_FILENO;
539 }
540
541 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
542 o = fixup_output(context->std_output, socket_fd);
543
544 if (fileno == STDERR_FILENO) {
545 ExecOutput e;
546 e = fixup_output(context->std_error, socket_fd);
547
548 /* This expects the input and output are already set up */
549
550 /* Don't change the stderr file descriptor if we inherit all
551 * the way and are not on a tty */
552 if (e == EXEC_OUTPUT_INHERIT &&
553 o == EXEC_OUTPUT_INHERIT &&
554 i == EXEC_INPUT_NULL &&
555 !is_terminal_input(context->std_input) &&
556 getppid() != 1)
557 return fileno;
558
559 /* Duplicate from stdout if possible */
560 if (can_inherit_stderr_from_stdout(context, o, e))
561 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
562
563 o = e;
564
565 } else if (o == EXEC_OUTPUT_INHERIT) {
566 /* If input got downgraded, inherit the original value */
567 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
568 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
569
570 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
571 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
572 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
573
574 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
575 if (getppid() != 1)
576 return fileno;
577
578 /* We need to open /dev/null here anew, to get the right access mode. */
579 return open_null_as(O_WRONLY, fileno);
580 }
581
582 switch (o) {
583
584 case EXEC_OUTPUT_NULL:
585 return open_null_as(O_WRONLY, fileno);
586
587 case EXEC_OUTPUT_TTY:
588 if (is_terminal_input(i))
589 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
590
591 /* We don't reset the terminal if this is just about output */
592 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
593
594 case EXEC_OUTPUT_KMSG:
595 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
596 case EXEC_OUTPUT_JOURNAL:
597 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
598 r = connect_logger_as(context, params, o, ident, fileno, uid, gid);
599 if (r < 0) {
600 log_exec_warning_errno(context,
601 params,
602 r,
603 "Failed to connect %s to the journal socket, ignoring: %m",
604 fileno == STDOUT_FILENO ? "stdout" : "stderr");
605 r = open_null_as(O_WRONLY, fileno);
606 } else {
607 struct stat st;
608
609 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
610 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
611 * services to detect whether they are connected to the journal or not.
612 *
613 * If both stdout and stderr are connected to a stream then let's make sure to store the data
614 * about STDERR as that's usually the best way to do logging. */
615
616 if (fstat(fileno, &st) >= 0 &&
617 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
618 *journal_stream_dev = st.st_dev;
619 *journal_stream_ino = st.st_ino;
620 }
621 }
622 return r;
623
624 case EXEC_OUTPUT_SOCKET:
625 assert(socket_fd >= 0);
626
627 return RET_NERRNO(dup2(socket_fd, fileno));
628
629 case EXEC_OUTPUT_NAMED_FD:
630 assert(named_iofds[fileno] >= 0);
631
632 (void) fd_nonblock(named_iofds[fileno], false);
633 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
634
635 case EXEC_OUTPUT_FILE:
636 case EXEC_OUTPUT_FILE_APPEND:
637 case EXEC_OUTPUT_FILE_TRUNCATE: {
638 bool rw;
639 int fd, flags;
640
641 assert(context->stdio_file[fileno]);
642
643 rw = context->std_input == EXEC_INPUT_FILE &&
644 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
645
646 if (rw)
647 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
648
649 flags = O_WRONLY;
650 if (o == EXEC_OUTPUT_FILE_APPEND)
651 flags |= O_APPEND;
652 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
653 flags |= O_TRUNC;
654
655 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
656 if (fd < 0)
657 return fd;
658
659 return move_fd(fd, fileno, 0);
660 }
661
662 default:
663 assert_not_reached();
664 }
665}
666
667static int chown_terminal(int fd, uid_t uid) {
668 int r;
669
670 assert(fd >= 0);
671
672 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
673 if (isatty(fd) < 1) {
674 if (IN_SET(errno, EINVAL, ENOTTY))
675 return 0; /* not a tty */
676
677 return -errno;
678 }
679
680 /* This might fail. What matters are the results. */
681 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
682 if (r < 0)
683 return r;
684
685 return 1;
686}
687
688static int setup_confirm_stdio(
689 const ExecContext *context,
690 const char *vc,
691 int *ret_saved_stdin,
692 int *ret_saved_stdout) {
693
694 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
75689fb2
LB
695 int r;
696
697 assert(ret_saved_stdin);
698 assert(ret_saved_stdout);
699
700 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
701 if (saved_stdin < 0)
702 return -errno;
703
704 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
705 if (saved_stdout < 0)
706 return -errno;
707
708 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
709 if (fd < 0)
710 return fd;
711
712 r = chown_terminal(fd, getuid());
713 if (r < 0)
714 return r;
715
716 r = reset_terminal_fd(fd, true);
717 if (r < 0)
718 return r;
719
d2b9e755 720 r = exec_context_apply_tty_size(context, fd, vc);
75689fb2
LB
721 if (r < 0)
722 return r;
723
724 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
725 TAKE_FD(fd);
726 if (r < 0)
727 return r;
728
729 *ret_saved_stdin = TAKE_FD(saved_stdin);
730 *ret_saved_stdout = TAKE_FD(saved_stdout);
731 return 0;
732}
733
/* Writes a message to 'fd' explaining that asking for confirmation for unit 'unit_id' failed with the
 * (negative) error 'err', and that a positive response is assumed. */
static void write_confirm_error_fd(int err, int fd, const char *unit_id) {
        assert(err < 0);
        assert(unit_id);

        if (err != -ETIMEDOUT) {
                /* %m formats errno, hence load it with our error first */
                errno = -err;
                dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", unit_id);
                return;
        }

        dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", unit_id);
}
745
746static void write_confirm_error(int err, const char *vc, const char *unit_id) {
747 _cleanup_close_ int fd = -EBADF;
748
749 assert(vc);
750
751 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
752 if (fd < 0)
753 return;
754
755 write_confirm_error_fd(err, fd, unit_id);
756}
757
/* Counterpart of setup_confirm_stdio(): releases the terminal, puts the saved stdin/stdout back in place
 * and closes the saved copies (setting both parameters to the value safe_close() returns).
 *
 * Returns 0 on success, otherwise the negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
779
/* Possible outcomes of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* go ahead and execute the command */
};
785
/* Returns true if the flag file /run/systemd/confirm_spawn_disabled exists. */
static bool confirm_spawn_disabled(void) {
        const char *flag_file = "/run/systemd/confirm_spawn_disabled";

        if (access(flag_file, F_OK) < 0)
                return false;

        return true;
}
789
790static int ask_for_confirmation(const ExecContext *context, const ExecParameters *params, const char *cmdline) {
791 int saved_stdout = -1, saved_stdin = -1, r;
792 _cleanup_free_ char *e = NULL;
793 char c;
794
795 assert(context);
796 assert(params);
797
798 /* For any internal errors, assume a positive response. */
799 r = setup_confirm_stdio(context, params->confirm_spawn, &saved_stdin, &saved_stdout);
800 if (r < 0) {
801 write_confirm_error(r, params->confirm_spawn, params->unit_id);
802 return CONFIRM_EXECUTE;
803 }
804
805 /* confirm_spawn might have been disabled while we were sleeping. */
806 if (!params->confirm_spawn || confirm_spawn_disabled()) {
807 r = 1;
808 goto restore_stdio;
809 }
810
811 e = ellipsize(cmdline, 60, 100);
812 if (!e) {
813 log_oom();
814 r = CONFIRM_EXECUTE;
815 goto restore_stdio;
816 }
817
818 for (;;) {
819 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
820 if (r < 0) {
821 write_confirm_error_fd(r, STDOUT_FILENO, params->unit_id);
822 r = CONFIRM_EXECUTE;
823 goto restore_stdio;
824 }
825
826 switch (c) {
827 case 'c':
828 printf("Resuming normal execution.\n");
829 manager_disable_confirm_spawn();
830 r = 1;
831 break;
832 case 'D':
833 printf(" Unit: %s\n",
834 params->unit_id);
835 exec_context_dump(context, stdout, " ");
836 exec_params_dump(params, stdout, " ");
837 continue; /* ask again */
838 case 'f':
839 printf("Failing execution.\n");
840 r = CONFIRM_PRETEND_FAILURE;
841 break;
842 case 'h':
843 printf(" c - continue, proceed without asking anymore\n"
844 " D - dump, show the state of the unit\n"
845 " f - fail, don't execute the command and pretend it failed\n"
846 " h - help\n"
847 " i - info, show a short summary of the unit\n"
848 " j - jobs, show jobs that are in progress\n"
849 " s - skip, don't execute the command and pretend it succeeded\n"
850 " y - yes, execute the command\n");
851 continue; /* ask again */
852 case 'i':
853 printf(" Unit: %s\n"
854 " Command: %s\n",
855 params->unit_id, cmdline);
856 continue; /* ask again */
857 case 'j':
858 if (sigqueue(getppid(),
859 SIGRTMIN+18,
860 (const union sigval) { .sival_int = MANAGER_SIGNAL_COMMAND_DUMP_JOBS }) < 0)
861 return -errno;
862
863 continue; /* ask again */
864 case 'n':
865 /* 'n' was removed in favor of 'f'. */
866 printf("Didn't understand 'n', did you mean 'f'?\n");
867 continue; /* ask again */
868 case 's':
869 printf("Skipping execution.\n");
870 r = CONFIRM_PRETEND_SUCCESS;
871 break;
872 case 'y':
873 r = CONFIRM_EXECUTE;
874 break;
875 default:
876 assert_not_reached();
877 }
878 break;
879 }
880
881restore_stdio:
882 restore_confirm_stdio(&saved_stdin, &saved_stdout);
883 return r;
884}
885
886static int get_fixed_user(
8d85efae
MY
887 const char *user_or_uid,
888 const char **ret_username,
75689fb2
LB
889 uid_t *ret_uid,
890 gid_t *ret_gid,
891 const char **ret_home,
892 const char **ret_shell) {
893
894 int r;
895
8d85efae
MY
896 assert(user_or_uid);
897 assert(ret_username);
75689fb2
LB
898
899 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
900 * (i.e. are "/" or "/bin/nologin"). */
901
8d85efae 902 r = get_user_creds(&user_or_uid, ret_uid, ret_gid, ret_home, ret_shell, USER_CREDS_CLEAN);
75689fb2
LB
903 if (r < 0)
904 return r;
905
8d85efae
MY
906 /* user_or_uid is normalized by get_user_creds to username */
907 *ret_username = user_or_uid;
908
75689fb2
LB
909 return 0;
910}
911
/* Resolves the specified group name or numeric GID via get_group_creds(). On success the normalized
 * group name is returned in *ret_groupname, and the GID in *ret_gid.
 *
 * Returns 0 on success, or the negative error get_group_creds() reported. */
static int get_fixed_group(
                const char *group_or_gid,
                const char **ret_groupname,
                gid_t *ret_gid) {

        const char *name;
        int r;

        assert(group_or_gid);
        assert(ret_groupname);

        name = group_or_gid;
        r = get_group_creds(&name, ret_gid, /* flags = */ 0);
        if (r < 0)
                return r;

        /* get_group_creds() normalized whatever we passed in to the group name */
        *ret_groupname = name;
        return 0;
}
931
932static int get_supplementary_groups(const ExecContext *c, const char *user,
933 const char *group, gid_t gid,
934 gid_t **supplementary_gids, int *ngids) {
935 int r, k = 0;
936 int ngroups_max;
937 bool keep_groups = false;
938 gid_t *groups = NULL;
939 _cleanup_free_ gid_t *l_gids = NULL;
940
941 assert(c);
942
943 /*
944 * If user is given, then lookup GID and supplementary groups list.
945 * We avoid NSS lookups for gid=0. Also we have to initialize groups
946 * here and as early as possible so we keep the list of supplementary
947 * groups of the caller.
948 */
949 if (user && gid_is_valid(gid) && gid != 0) {
950 /* First step, initialize groups from /etc/groups */
951 if (initgroups(user, gid) < 0)
952 return -errno;
953
954 keep_groups = true;
955 }
956
957 if (strv_isempty(c->supplementary_groups))
958 return 0;
959
960 /*
961 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
962 * be positive, otherwise fail.
963 */
964 errno = 0;
965 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
966 if (ngroups_max <= 0)
967 return errno_or_else(EOPNOTSUPP);
968
969 l_gids = new(gid_t, ngroups_max);
970 if (!l_gids)
971 return -ENOMEM;
972
973 if (keep_groups) {
974 /*
975 * Lookup the list of groups that the user belongs to, we
976 * avoid NSS lookups here too for gid=0.
977 */
978 k = ngroups_max;
979 if (getgrouplist(user, gid, l_gids, &k) < 0)
980 return -EINVAL;
981 } else
982 k = 0;
983
984 STRV_FOREACH(i, c->supplementary_groups) {
985 const char *g;
986
987 if (k >= ngroups_max)
988 return -E2BIG;
989
990 g = *i;
991 r = get_group_creds(&g, l_gids+k, 0);
992 if (r < 0)
993 return r;
994
995 k++;
996 }
997
998 /*
999 * Sets ngids to zero to drop all supplementary groups, happens
1000 * when we are under root and SupplementaryGroups= is empty.
1001 */
1002 if (k == 0) {
1003 *ngids = 0;
1004 return 0;
1005 }
1006
1007 /* Otherwise get the final list of supplementary groups */
1008 groups = memdup(l_gids, sizeof(gid_t) * k);
1009 if (!groups)
1010 return -ENOMEM;
1011
1012 *supplementary_gids = groups;
1013 *ngids = k;
1014
1015 groups = NULL;
1016
1017 return 0;
1018}
1019
/* Applies the supplementary groups (if there are any) and then, if 'gid' is valid, switches the real,
 * effective and saved GID to it.
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        bool have_supplementary = ngids > 0;

        /* SupplementaryGroups= is only handled if it is not empty */
        if (have_supplementary) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1038
/* Updates the securebits of the calling process: every bit set in 'mask' is cleared, then every bit set
 * in 'bits' is set.
 *
 * Returns 1 if the securebits were changed, 0 if they already had the requested value, and a negative
 * errno if reading or writing them failed. */
static int set_securebits(unsigned bits, unsigned mask) {
        unsigned before, after;
        int current;

        current = prctl(PR_GET_SECUREBITS);
        if (current < 0)
                return -errno;

        before = (unsigned) current;
        after = (before & ~mask) | bits;

        /* Avoid the second system call if there is nothing to change */
        if (after == before)
                return 0;

        if (prctl(PR_SET_SECUREBITS, after) < 0)
                return -errno;

        return 1;
}
1057
1058static int enforce_user(
1059 const ExecContext *context,
1060 uid_t uid,
1061 uint64_t capability_ambient_set) {
1062 assert(context);
1063 int r;
1064
1065 if (!uid_is_valid(uid))
1066 return 0;
1067
1068 /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1069 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1070 * case. */
1071
1072 if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1073
1074 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1075 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1076 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1077 if (r < 0)
1078 return r;
1079 }
1080
1081 /* Second step: actually set the uids */
1082 if (setresuid(uid, uid, uid) < 0)
1083 return -errno;
1084
1085 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1086 * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1087 * outside of this call. */
1088 return 0;
1089}
1090
1091#if HAVE_PAM
1092
1093static int null_conv(
1094 int num_msg,
1095 const struct pam_message **msg,
1096 struct pam_response **resp,
1097 void *appdata_ptr) {
1098
1099 /* We don't support conversations */
1100
1101 return PAM_CONV_ERR;
1102}
1103
1104#endif
1105
1106static int setup_pam(
1107 const char *name,
1108 const char *user,
1109 uid_t uid,
1110 gid_t gid,
1111 const char *tty,
1112 char ***env, /* updated on success */
1113 const int fds[], size_t n_fds) {
1114
1115#if HAVE_PAM
1116
1117 static const struct pam_conv conv = {
1118 .conv = null_conv,
1119 .appdata_ptr = NULL
1120 };
1121
1122 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1123 _cleanup_strv_free_ char **e = NULL;
1124 pam_handle_t *handle = NULL;
1125 sigset_t old_ss;
1126 int pam_code = PAM_SUCCESS, r;
1127 bool close_session = false;
1128 pid_t pam_pid = 0, parent_pid;
1129 int flags = 0;
1130
1131 assert(name);
1132 assert(user);
1133 assert(env);
1134
1135 /* We set up PAM in the parent process, then fork. The child
1136 * will then stay around until killed via PR_GET_PDEATHSIG or
1137 * systemd via the cgroup logic. It will then remove the PAM
1138 * session again. The parent process will exec() the actual
1139 * daemon. We do things this way to ensure that the main PID
1140 * of the daemon is the one we initially fork()ed. */
1141
1142 r = barrier_create(&barrier);
1143 if (r < 0)
1144 goto fail;
1145
1146 if (log_get_max_level() < LOG_DEBUG)
1147 flags |= PAM_SILENT;
1148
1149 pam_code = pam_start(name, user, &conv, &handle);
1150 if (pam_code != PAM_SUCCESS) {
1151 handle = NULL;
1152 goto fail;
1153 }
1154
1155 if (!tty) {
1156 _cleanup_free_ char *q = NULL;
1157
1158 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1159 * out if that's the case, and read the TTY off it. */
1160
1161 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1162 tty = strjoina("/dev/", q);
1163 }
1164
1165 if (tty) {
1166 pam_code = pam_set_item(handle, PAM_TTY, tty);
1167 if (pam_code != PAM_SUCCESS)
1168 goto fail;
1169 }
1170
1171 STRV_FOREACH(nv, *env) {
1172 pam_code = pam_putenv(handle, *nv);
1173 if (pam_code != PAM_SUCCESS)
1174 goto fail;
1175 }
1176
1177 pam_code = pam_acct_mgmt(handle, flags);
1178 if (pam_code != PAM_SUCCESS)
1179 goto fail;
1180
1181 pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
1182 if (pam_code != PAM_SUCCESS)
1183 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));
1184
1185 pam_code = pam_open_session(handle, flags);
1186 if (pam_code != PAM_SUCCESS)
1187 goto fail;
1188
1189 close_session = true;
1190
1191 e = pam_getenvlist(handle);
1192 if (!e) {
1193 pam_code = PAM_BUF_ERR;
1194 goto fail;
1195 }
1196
1197 /* Block SIGTERM, so that we know that it won't get lost in the child */
1198
1199 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1200
1201 parent_pid = getpid_cached();
1202
1203 r = safe_fork("(sd-pam)", 0, &pam_pid);
1204 if (r < 0)
1205 goto fail;
1206 if (r == 0) {
1207 int sig, ret = EXIT_PAM;
1208
1209 /* The child's job is to reset the PAM session on termination */
1210 barrier_set_role(&barrier, BARRIER_CHILD);
1211
1212 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1213 * those fds are open here that have been opened by PAM. */
1214 (void) close_many(fds, n_fds);
1215
1216 /* Drop privileges - we don't need any to pam_close_session and this will make
1217 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
1218 * threads to fail to exit normally */
1219
1220 r = maybe_setgroups(0, NULL);
1221 if (r < 0)
1222 log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
1223 if (setresgid(gid, gid, gid) < 0)
1224 log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
1225 if (setresuid(uid, uid, uid) < 0)
1226 log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
1227
1228 (void) ignore_signals(SIGPIPE);
1229
1230 /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1231 * otherwise the kernel will not allow unprivileged parents kill their privileged children
1232 * this way. We rely on the control groups kill logic to do the rest for us. */
1233 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1234 goto child_finish;
1235
1236 /* Tell the parent that our setup is done. This is especially important regarding dropping
1237 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1238 *
1239 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1240 (void) barrier_place(&barrier);
1241
1242 /* Check if our parent process might already have died? */
1243 if (getppid() == parent_pid) {
1244 sigset_t ss;
1245
1246 assert_se(sigemptyset(&ss) >= 0);
1247 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1248
1249 for (;;) {
1250 if (sigwait(&ss, &sig) < 0) {
1251 if (errno == EINTR)
1252 continue;
1253
1254 goto child_finish;
1255 }
1256
1257 assert(sig == SIGTERM);
1258 break;
1259 }
1260 }
1261
1262 pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
1263 if (pam_code != PAM_SUCCESS)
1264 goto child_finish;
1265
1266 /* If our parent died we'll end the session */
1267 if (getppid() != parent_pid) {
1268 pam_code = pam_close_session(handle, flags);
1269 if (pam_code != PAM_SUCCESS)
1270 goto child_finish;
1271 }
1272
1273 ret = 0;
1274
1275 child_finish:
1276 /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1277 * know about this. See pam_end(3) */
1278 (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
1279 _exit(ret);
1280 }
1281
1282 barrier_set_role(&barrier, BARRIER_PARENT);
1283
1284 /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1285 * here. */
1286 handle = NULL;
1287
1288 /* Unblock SIGTERM again in the parent */
1289 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1290
1291 /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1292 * this fd around. */
1293 closelog();
1294
1295 /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1296 * recover. However, warn loudly if it happens. */
1297 if (!barrier_place_and_sync(&barrier))
1298 log_error("PAM initialization failed");
1299
1300 return strv_free_and_replace(*env, e);
1301
1302fail:
1303 if (pam_code != PAM_SUCCESS) {
1304 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1305 r = -EPERM; /* PAM errors do not map to errno */
1306 } else
1307 log_error_errno(r, "PAM failed: %m");
1308
1309 if (handle) {
1310 if (close_session)
1311 pam_code = pam_close_session(handle, flags);
1312
1313 (void) pam_end(handle, pam_code | flags);
1314 }
1315
1316 closelog();
1317 return r;
1318#else
1319 return 0;
1320#endif
1321}
1322
1323static void rename_process_from_path(const char *path) {
1324 _cleanup_free_ char *buf = NULL;
1325 const char *p;
1326
1327 assert(path);
1328
1329 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1330 * /bin/ps */
1331
1332 if (path_extract_filename(path, &buf) < 0) {
1333 rename_process("(...)");
1334 return;
1335 }
1336
1337 size_t l = strlen(buf);
1338 if (l > 8) {
1339 /* The end of the process name is usually more interesting, since the first bit might just be
1340 * "systemd-" */
1341 p = buf + l - 8;
1342 l = 8;
1343 } else
1344 p = buf;
1345
1346 char process_name[11];
1347 process_name[0] = '(';
1348 memcpy(process_name+1, p, l);
1349 process_name[1+l] = ')';
1350 process_name[1+l+1] = 0;
1351
1352 rename_process(process_name);
1353}
1354
1355static bool context_has_address_families(const ExecContext *c) {
1356 assert(c);
1357
1358 return c->address_families_allow_list ||
1359 !set_isempty(c->address_families);
1360}
1361
1362static bool context_has_syscall_filters(const ExecContext *c) {
1363 assert(c);
1364
1365 return c->syscall_allow_list ||
1366 !hashmap_isempty(c->syscall_filter);
1367}
1368
1369static bool context_has_syscall_logs(const ExecContext *c) {
1370 assert(c);
1371
1372 return c->syscall_log_allow_list ||
1373 !hashmap_isempty(c->syscall_log);
1374}
1375
24832d10 1376static bool context_has_seccomp(const ExecContext *c) {
75689fb2
LB
1377 /* We need NNP if we have any form of seccomp and are unprivileged */
1378 return c->lock_personality ||
1379 c->memory_deny_write_execute ||
1380 c->private_devices ||
1381 c->protect_clock ||
1382 c->protect_hostname ||
1383 c->protect_kernel_tunables ||
1384 c->protect_kernel_modules ||
1385 c->protect_kernel_logs ||
1386 context_has_address_families(c) ||
1387 exec_context_restrict_namespaces_set(c) ||
1388 c->restrict_realtime ||
1389 c->restrict_suid_sgid ||
1390 !set_isempty(c->syscall_archs) ||
1391 context_has_syscall_filters(c) ||
1392 context_has_syscall_logs(c);
1393}
1394
24832d10
ILG
1395static bool context_has_no_new_privileges(const ExecContext *c) {
1396 assert(c);
1397
1398 if (c->no_new_privileges)
1399 return true;
1400
1401 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1402 return false;
1403
1404 return context_has_seccomp(c);
1405}
1406
75689fb2
LB
1407#if HAVE_SECCOMP
1408
24832d10
ILG
1409static bool seccomp_allows_drop_privileges(const ExecContext *c) {
1410 void *id, *val;
1411 bool has_capget = false, has_capset = false, has_prctl = false;
1412
1413 assert(c);
1414
1415 /* No syscall filter, we are allowed to drop privileges */
1416 if (hashmap_isempty(c->syscall_filter))
1417 return true;
1418
1419 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
1420 _cleanup_free_ char *name = NULL;
1421
1422 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
1423
1424 if (streq(name, "capget"))
1425 has_capget = true;
1426 else if (streq(name, "capset"))
1427 has_capset = true;
1428 else if (streq(name, "prctl"))
1429 has_prctl = true;
1430 }
1431
1432 if (c->syscall_allow_list)
1433 return has_capget && has_capset && has_prctl;
1434 else
1435 return !(has_capget || has_capset || has_prctl);
1436}
1437
75689fb2
LB
1438static bool skip_seccomp_unavailable(const ExecContext *c, const ExecParameters *p, const char* msg) {
1439
1440 if (is_seccomp_available())
1441 return false;
1442
1443 log_exec_debug(c, p, "SECCOMP features not detected in the kernel, skipping %s", msg);
1444 return true;
1445}
1446
1447static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p, bool needs_ambient_hack) {
1448 uint32_t negative_action, default_action, action;
1449 int r;
1450
1451 assert(c);
1452 assert(p);
1453
1454 if (!context_has_syscall_filters(c))
1455 return 0;
1456
1457 if (skip_seccomp_unavailable(c, p, "SystemCallFilter="))
1458 return 0;
1459
1460 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1461
1462 if (c->syscall_allow_list) {
1463 default_action = negative_action;
1464 action = SCMP_ACT_ALLOW;
1465 } else {
1466 default_action = SCMP_ACT_ALLOW;
1467 action = negative_action;
1468 }
1469
1470 if (needs_ambient_hack) {
1471 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1472 if (r < 0)
1473 return r;
1474 }
1475
1476 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1477}
1478
1479static int apply_syscall_log(const ExecContext *c, const ExecParameters *p) {
1480#ifdef SCMP_ACT_LOG
1481 uint32_t default_action, action;
1482#endif
1483
1484 assert(c);
1485 assert(p);
1486
1487 if (!context_has_syscall_logs(c))
1488 return 0;
1489
1490#ifdef SCMP_ACT_LOG
1491 if (skip_seccomp_unavailable(c, p, "SystemCallLog="))
1492 return 0;
1493
1494 if (c->syscall_log_allow_list) {
1495 /* Log nothing but the ones listed */
1496 default_action = SCMP_ACT_ALLOW;
1497 action = SCMP_ACT_LOG;
1498 } else {
1499 /* Log everything but the ones listed */
1500 default_action = SCMP_ACT_LOG;
1501 action = SCMP_ACT_ALLOW;
1502 }
1503
1504 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1505#else
1506 /* old libseccomp */
1507 log_exec_debug(c, p, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1508 return 0;
1509#endif
1510}
1511
1512static int apply_syscall_archs(const ExecContext *c, const ExecParameters *p) {
1513 assert(c);
1514 assert(p);
1515
1516 if (set_isempty(c->syscall_archs))
1517 return 0;
1518
1519 if (skip_seccomp_unavailable(c, p, "SystemCallArchitectures="))
1520 return 0;
1521
1522 return seccomp_restrict_archs(c->syscall_archs);
1523}
1524
1525static int apply_address_families(const ExecContext *c, const ExecParameters *p) {
1526 assert(c);
1527 assert(p);
1528
1529 if (!context_has_address_families(c))
1530 return 0;
1531
1532 if (skip_seccomp_unavailable(c, p, "RestrictAddressFamilies="))
1533 return 0;
1534
1535 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1536}
1537
1538static int apply_memory_deny_write_execute(const ExecContext *c, const ExecParameters *p) {
1539 int r;
1540
1541 assert(c);
1542 assert(p);
1543
1544 if (!c->memory_deny_write_execute)
1545 return 0;
1546
1547 /* use prctl() if kernel supports it (6.3) */
1548 r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1549 if (r == 0) {
1550 log_exec_debug(c, p, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1551 return 0;
1552 }
1553 if (r < 0 && errno != EINVAL)
1554 return log_exec_debug_errno(c,
1555 p,
1556 errno,
1557 "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1558 /* else use seccomp */
1559 log_exec_debug(c, p, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1560
1561 if (skip_seccomp_unavailable(c, p, "MemoryDenyWriteExecute="))
1562 return 0;
1563
1564 return seccomp_memory_deny_write_execute();
1565}
1566
1567static int apply_restrict_realtime(const ExecContext *c, const ExecParameters *p) {
1568 assert(c);
1569 assert(p);
1570
1571 if (!c->restrict_realtime)
1572 return 0;
1573
1574 if (skip_seccomp_unavailable(c, p, "RestrictRealtime="))
1575 return 0;
1576
1577 return seccomp_restrict_realtime();
1578}
1579
1580static int apply_restrict_suid_sgid(const ExecContext *c, const ExecParameters *p) {
1581 assert(c);
1582 assert(p);
1583
1584 if (!c->restrict_suid_sgid)
1585 return 0;
1586
1587 if (skip_seccomp_unavailable(c, p, "RestrictSUIDSGID="))
1588 return 0;
1589
1590 return seccomp_restrict_suid_sgid();
1591}
1592
1593static int apply_protect_sysctl(const ExecContext *c, const ExecParameters *p) {
1594 assert(c);
1595 assert(p);
1596
1597 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1598 * let's protect even those systems where this is left on in the kernel. */
1599
1600 if (!c->protect_kernel_tunables)
1601 return 0;
1602
1603 if (skip_seccomp_unavailable(c, p, "ProtectKernelTunables="))
1604 return 0;
1605
1606 return seccomp_protect_sysctl();
1607}
1608
1609static int apply_protect_kernel_modules(const ExecContext *c, const ExecParameters *p) {
1610 assert(c);
1611 assert(p);
1612
1613 /* Turn off module syscalls on ProtectKernelModules=yes */
1614
1615 if (!c->protect_kernel_modules)
1616 return 0;
1617
1618 if (skip_seccomp_unavailable(c, p, "ProtectKernelModules="))
1619 return 0;
1620
1621 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1622}
1623
1624static int apply_protect_kernel_logs(const ExecContext *c, const ExecParameters *p) {
1625 assert(c);
1626 assert(p);
1627
1628 if (!c->protect_kernel_logs)
1629 return 0;
1630
1631 if (skip_seccomp_unavailable(c, p, "ProtectKernelLogs="))
1632 return 0;
1633
1634 return seccomp_protect_syslog();
1635}
1636
1637static int apply_protect_clock(const ExecContext *c, const ExecParameters *p) {
1638 assert(c);
1639 assert(p);
1640
1641 if (!c->protect_clock)
1642 return 0;
1643
1644 if (skip_seccomp_unavailable(c, p, "ProtectClock="))
1645 return 0;
1646
1647 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1648}
1649
1650static int apply_private_devices(const ExecContext *c, const ExecParameters *p) {
1651 assert(c);
1652 assert(p);
1653
1654 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1655
1656 if (!c->private_devices)
1657 return 0;
1658
1659 if (skip_seccomp_unavailable(c, p, "PrivateDevices="))
1660 return 0;
1661
1662 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1663}
1664
1665static int apply_restrict_namespaces(const ExecContext *c, const ExecParameters *p) {
1666 assert(c);
1667 assert(p);
1668
1669 if (!exec_context_restrict_namespaces_set(c))
1670 return 0;
1671
1672 if (skip_seccomp_unavailable(c, p, "RestrictNamespaces="))
1673 return 0;
1674
1675 return seccomp_restrict_namespaces(c->restrict_namespaces);
1676}
1677
1678static int apply_lock_personality(const ExecContext *c, const ExecParameters *p) {
1679 unsigned long personality;
1680 int r;
1681
1682 assert(c);
1683 assert(p);
1684
1685 if (!c->lock_personality)
1686 return 0;
1687
1688 if (skip_seccomp_unavailable(c, p, "LockPersonality="))
1689 return 0;
1690
1691 personality = c->personality;
1692
1693 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1694 if (personality == PERSONALITY_INVALID) {
1695
1696 r = opinionated_personality(&personality);
1697 if (r < 0)
1698 return r;
1699 }
1700
1701 return seccomp_lock_personality(personality);
1702}
1703
1704#endif
1705
1706#if HAVE_LIBBPF
1707static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) {
1708 int r;
1709
1710 assert(c);
1711 assert(p);
1712
1713 if (!exec_context_restrict_filesystems_set(c))
1714 return 0;
1715
1716 if (p->bpf_outer_map_fd < 0) {
1717 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1718 log_exec_debug(c, p, "LSM BPF not supported, skipping RestrictFileSystems=");
1719 return 0;
1720 }
1721
1722 /* We are in a new binary, so dl-open again */
1723 r = dlopen_bpf();
1724 if (r < 0)
1725 return r;
1726
1727 return lsm_bpf_restrict_filesystems(c->restrict_filesystems, p->cgroup_id, p->bpf_outer_map_fd, c->restrict_filesystems_allow_list);
1728}
1729#endif
1730
1731static int apply_protect_hostname(const ExecContext *c, const ExecParameters *p, int *ret_exit_status) {
1732 assert(c);
1733 assert(p);
1734
1735 if (!c->protect_hostname)
1736 return 0;
1737
1738 if (ns_type_supported(NAMESPACE_UTS)) {
1739 if (unshare(CLONE_NEWUTS) < 0) {
1740 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1741 *ret_exit_status = EXIT_NAMESPACE;
1742 return log_exec_error_errno(c,
1743 p,
1744 errno,
1745 "Failed to set up UTS namespacing: %m");
1746 }
1747
1748 log_exec_warning(c,
1749 p,
1750 "ProtectHostname=yes is configured, but UTS namespace setup is "
1751 "prohibited (container manager?), ignoring namespace setup.");
1752 }
1753 } else
1754 log_exec_warning(c,
1755 p,
1756 "ProtectHostname=yes is configured, but the kernel does not "
1757 "support UTS namespaces, ignoring namespace setup.");
1758
1759#if HAVE_SECCOMP
1760 int r;
1761
1762 if (skip_seccomp_unavailable(c, p, "ProtectHostname="))
1763 return 0;
1764
1765 r = seccomp_protect_hostname();
1766 if (r < 0) {
1767 *ret_exit_status = EXIT_SECCOMP;
1768 return log_exec_error_errno(c, p, r, "Failed to apply hostname restrictions: %m");
1769 }
1770#endif
1771
1772 return 0;
1773}
1774
1775static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1776 assert(idle_pipe);
1777
1778 idle_pipe[1] = safe_close(idle_pipe[1]);
1779 idle_pipe[2] = safe_close(idle_pipe[2]);
1780
1781 if (idle_pipe[0] >= 0) {
1782 int r;
1783
1784 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1785
1786 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1787 ssize_t n;
1788
1789 /* Signal systemd that we are bored and want to continue. */
1790 n = write(idle_pipe[3], "x", 1);
1791 if (n > 0)
1792 /* Wait for systemd to react to the signal above. */
1793 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1794 }
1795
1796 idle_pipe[0] = safe_close(idle_pipe[0]);
1797
1798 }
1799
1800 idle_pipe[3] = safe_close(idle_pipe[3]);
1801}
1802
1803static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1804
1805/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
1806 * the service payload in. */
1807static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1808 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
1809 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
1810 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
1811 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
1812 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
1813};
1814
1815DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
1816
1817static int build_environment(
1818 const ExecContext *c,
1819 const ExecParameters *p,
1820 const CGroupContext *cgroup_context,
1821 size_t n_fds,
1822 char **fdnames,
1823 const char *home,
1824 const char *username,
1825 const char *shell,
1826 dev_t journal_stream_dev,
1827 ino_t journal_stream_ino,
1828 const char *memory_pressure_path,
1829 char ***ret) {
1830
1831 _cleanup_strv_free_ char **our_env = NULL;
1832 size_t n_env = 0;
1833 char *x;
1834 int r;
1835
1836 assert(c);
1837 assert(p);
1838 assert(ret);
1839
1840#define N_ENV_VARS 19
1841 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1842 if (!our_env)
1843 return -ENOMEM;
1844
1845 if (n_fds > 0) {
1846 _cleanup_free_ char *joined = NULL;
1847
1848 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1849 return -ENOMEM;
1850 our_env[n_env++] = x;
1851
1852 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1853 return -ENOMEM;
1854 our_env[n_env++] = x;
1855
1856 joined = strv_join(fdnames, ":");
1857 if (!joined)
1858 return -ENOMEM;
1859
1860 x = strjoin("LISTEN_FDNAMES=", joined);
1861 if (!x)
1862 return -ENOMEM;
1863 our_env[n_env++] = x;
1864 }
1865
1866 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1867 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1868 return -ENOMEM;
1869 our_env[n_env++] = x;
1870
1871 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1872 return -ENOMEM;
1873 our_env[n_env++] = x;
1874 }
1875
1876 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1877 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1878 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1879 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1880 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1881 if (!x)
1882 return -ENOMEM;
1883 our_env[n_env++] = x;
1884 }
1885
1886 /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
1887 * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
1888 * really make much sense since we're not logged in. Hence we conditionalize the three based on
1889 * SetLoginEnvironment= switch. */
1890 if (!c->user && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
1891 r = get_fixed_user("root", &username, NULL, NULL, &home, &shell);
1892 if (r < 0)
1893 return log_exec_debug_errno(c,
1894 p,
1895 r,
1896 "Failed to determine user credentials for root: %m");
1897 }
1898
1899 bool set_user_login_env = c->set_login_environment >= 0 ? c->set_login_environment : (c->user || c->dynamic_user);
1900
1901 if (username) {
1902 x = strjoin("USER=", username);
1903 if (!x)
1904 return -ENOMEM;
1905 our_env[n_env++] = x;
1906
1907 if (set_user_login_env) {
1908 x = strjoin("LOGNAME=", username);
1909 if (!x)
1910 return -ENOMEM;
1911 our_env[n_env++] = x;
1912 }
1913 }
1914
1915 if (home && set_user_login_env) {
1916 x = strjoin("HOME=", home);
1917 if (!x)
1918 return -ENOMEM;
1919
1920 path_simplify(x + 5);
1921 our_env[n_env++] = x;
1922 }
1923
1924 if (shell && set_user_login_env) {
1925 x = strjoin("SHELL=", shell);
1926 if (!x)
1927 return -ENOMEM;
1928
1929 path_simplify(x + 6);
1930 our_env[n_env++] = x;
1931 }
1932
1933 if (!sd_id128_is_null(p->invocation_id)) {
1934 assert(p->invocation_id_string);
1935
1936 x = strjoin("INVOCATION_ID=", p->invocation_id_string);
1937 if (!x)
1938 return -ENOMEM;
1939
1940 our_env[n_env++] = x;
1941 }
1942
1943 if (exec_context_needs_term(c)) {
1944 _cleanup_free_ char *cmdline = NULL;
1945 const char *tty_path, *term = NULL;
1946
1947 tty_path = exec_context_tty_path(c);
1948
1949 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1950 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1951 * container manager passes to PID 1 ends up all the way in the console login shown. */
1952
1953 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1954 term = getenv("TERM");
1955 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1956 _cleanup_free_ char *key = NULL;
1957
1958 key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
1959 if (!key)
1960 return -ENOMEM;
1961
1962 r = proc_cmdline_get_key(key, 0, &cmdline);
1963 if (r < 0)
1964 log_exec_debug_errno(c,
1965 p,
1966 r,
1967 "Failed to read %s from kernel cmdline, ignoring: %m",
1968 key);
1969 else if (r > 0)
1970 term = cmdline;
1971 }
1972
1973 if (!term)
1974 term = default_term_for_tty(tty_path);
1975
1976 x = strjoin("TERM=", term);
1977 if (!x)
1978 return -ENOMEM;
1979 our_env[n_env++] = x;
1980 }
1981
1982 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1983 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1984 return -ENOMEM;
1985
1986 our_env[n_env++] = x;
1987 }
1988
1989 if (c->log_namespace) {
1990 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1991 if (!x)
1992 return -ENOMEM;
1993
1994 our_env[n_env++] = x;
1995 }
1996
1997 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1998 _cleanup_free_ char *joined = NULL;
1999 const char *n;
2000
2001 if (!p->prefix[t])
2002 continue;
2003
2004 if (c->directories[t].n_items == 0)
2005 continue;
2006
2007 n = exec_directory_env_name_to_string(t);
2008 if (!n)
2009 continue;
2010
2011 for (size_t i = 0; i < c->directories[t].n_items; i++) {
2012 _cleanup_free_ char *prefixed = NULL;
2013
2014 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2015 if (!prefixed)
2016 return -ENOMEM;
2017
2018 if (!strextend_with_separator(&joined, ":", prefixed))
2019 return -ENOMEM;
2020 }
2021
2022 x = strjoin(n, "=", joined);
2023 if (!x)
2024 return -ENOMEM;
2025
2026 our_env[n_env++] = x;
2027 }
2028
2029 _cleanup_free_ char *creds_dir = NULL;
2030 r = exec_context_get_credential_directory(c, p, p->unit_id, &creds_dir);
2031 if (r < 0)
2032 return r;
2033 if (r > 0) {
2034 x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
2035 if (!x)
2036 return -ENOMEM;
2037
2038 our_env[n_env++] = x;
2039 }
2040
2041 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2042 return -ENOMEM;
2043
2044 our_env[n_env++] = x;
2045
2046 if (memory_pressure_path) {
2047 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2048 if (!x)
2049 return -ENOMEM;
2050
2051 our_env[n_env++] = x;
2052
2053 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2054 _cleanup_free_ char *b = NULL, *e = NULL;
2055
2056 if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2057 MEMORY_PRESSURE_DEFAULT_TYPE,
2058 cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2059 CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2060 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2061 return -ENOMEM;
2062
2063 if (base64mem(b, strlen(b) + 1, &e) < 0)
2064 return -ENOMEM;
2065
2066 x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2067 if (!x)
2068 return -ENOMEM;
2069
2070 our_env[n_env++] = x;
2071 }
2072 }
2073
2074 assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2075#undef N_ENV_VARS
2076
2077 *ret = TAKE_PTR(our_env);
2078
2079 return 0;
2080}
2081
2082static int build_pass_environment(const ExecContext *c, char ***ret) {
2083 _cleanup_strv_free_ char **pass_env = NULL;
2084 size_t n_env = 0;
2085
2086 STRV_FOREACH(i, c->pass_environment) {
2087 _cleanup_free_ char *x = NULL;
2088 char *v;
2089
2090 v = getenv(*i);
2091 if (!v)
2092 continue;
2093 x = strjoin(*i, "=", v);
2094 if (!x)
2095 return -ENOMEM;
2096
2097 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2098 return -ENOMEM;
2099
2100 pass_env[n_env++] = TAKE_PTR(x);
2101 pass_env[n_env] = NULL;
2102 }
2103
2104 *ret = TAKE_PTR(pass_env);
2105
2106 return 0;
2107}
2108
2109static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2110 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
71136404 2111 _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
75689fb2
LB
2112 _cleanup_close_ int unshare_ready_fd = -EBADF;
2113 _cleanup_(sigkill_waitp) pid_t pid = 0;
2114 uint64_t c = 1;
2115 ssize_t n;
2116 int r;
2117
2118 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2119 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2120 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2121 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2122 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2123 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2124 * continues execution normally.
2125 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2126 * does not need CAP_SETUID to write the single line mapping to itself. */
2127
2128 /* Can only set up multiple mappings with CAP_SETUID. */
2129 if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2130 r = asprintf(&uid_map,
2131 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2132 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2133 ouid, ouid, uid, uid);
2134 else
2135 r = asprintf(&uid_map,
2136 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2137 ouid, ouid);
2138
2139 if (r < 0)
2140 return -ENOMEM;
2141
2142 /* Can only set up multiple mappings with CAP_SETGID. */
2143 if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2144 r = asprintf(&gid_map,
2145 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2146 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2147 ogid, ogid, gid, gid);
2148 else
2149 r = asprintf(&gid_map,
2150 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2151 ogid, ogid);
2152
2153 if (r < 0)
2154 return -ENOMEM;
2155
2156 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2157 * namespace. */
2158 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2159 if (unshare_ready_fd < 0)
2160 return -errno;
2161
2162 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2163 * failed. */
2164 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2165 return -errno;
2166
e9ccae31 2167 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, &pid);
75689fb2
LB
2168 if (r < 0)
2169 return r;
2170 if (r == 0) {
2171 _cleanup_close_ int fd = -EBADF;
2172 const char *a;
2173 pid_t ppid;
2174
2175 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2176 * here, after the parent opened its own user namespace. */
2177
2178 ppid = getppid();
2179 errno_pipe[0] = safe_close(errno_pipe[0]);
2180
2181 /* Wait until the parent unshared the user namespace */
2182 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2183 r = -errno;
2184 goto child_fail;
2185 }
2186
2187 /* Disable the setgroups() system call in the child user namespace, for good. */
2188 a = procfs_file_alloca(ppid, "setgroups");
2189 fd = open(a, O_WRONLY|O_CLOEXEC);
2190 if (fd < 0) {
2191 if (errno != ENOENT) {
2192 r = -errno;
2193 goto child_fail;
2194 }
2195
2196 /* If the file is missing the kernel is too old, let's continue anyway. */
2197 } else {
2198 if (write(fd, "deny\n", 5) < 0) {
2199 r = -errno;
2200 goto child_fail;
2201 }
2202
2203 fd = safe_close(fd);
2204 }
2205
2206 /* First write the GID map */
2207 a = procfs_file_alloca(ppid, "gid_map");
2208 fd = open(a, O_WRONLY|O_CLOEXEC);
2209 if (fd < 0) {
2210 r = -errno;
2211 goto child_fail;
2212 }
2213 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2214 r = -errno;
2215 goto child_fail;
2216 }
2217 fd = safe_close(fd);
2218
2219 /* The write the UID map */
2220 a = procfs_file_alloca(ppid, "uid_map");
2221 fd = open(a, O_WRONLY|O_CLOEXEC);
2222 if (fd < 0) {
2223 r = -errno;
2224 goto child_fail;
2225 }
2226 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2227 r = -errno;
2228 goto child_fail;
2229 }
2230
2231 _exit(EXIT_SUCCESS);
2232
2233 child_fail:
2234 (void) write(errno_pipe[1], &r, sizeof(r));
2235 _exit(EXIT_FAILURE);
2236 }
2237
2238 errno_pipe[1] = safe_close(errno_pipe[1]);
2239
2240 if (unshare(CLONE_NEWUSER) < 0)
2241 return -errno;
2242
2243 /* Let the child know that the namespace is ready now */
2244 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2245 return -errno;
2246
2247 /* Try to read an error code from the child */
2248 n = read(errno_pipe[0], &r, sizeof(r));
2249 if (n < 0)
2250 return -errno;
2251 if (n == sizeof(r)) { /* an error code was sent to us */
2252 if (r < 0)
2253 return r;
2254 return -EIO;
2255 }
2256 if (n != 0) /* on success we should have read 0 bytes */
2257 return -EIO;
2258
2259 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2260 if (r < 0)
2261 return r;
2262 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2263 return -EIO;
2264
2265 return 0;
2266}
2267
2268static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2269 _cleanup_free_ char *src_abs = NULL;
2270 int r;
2271
2272 assert(source);
2273
2274 src_abs = path_join(root, source);
2275 if (!src_abs)
2276 return -ENOMEM;
2277
2278 STRV_FOREACH(dst, symlinks) {
2279 _cleanup_free_ char *dst_abs = NULL;
2280
2281 dst_abs = path_join(root, *dst);
2282 if (!dst_abs)
2283 return -ENOMEM;
2284
2285 r = mkdir_parents_label(dst_abs, 0755);
2286 if (r < 0)
2287 return r;
2288
2289 r = symlink_idempotent(src_abs, dst_abs, true);
2290 if (r < 0)
2291 return r;
2292 }
2293
2294 return 0;
2295}
2296
2297static int setup_exec_directory(
2298 const ExecContext *context,
2299 const ExecParameters *params,
2300 uid_t uid,
2301 gid_t gid,
2302 ExecDirectoryType type,
2303 bool needs_mount_namespace,
2304 int *exit_status) {
2305
2306 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2307 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2308 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2309 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2310 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2311 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2312 };
2313 int r;
2314
2315 assert(context);
2316 assert(params);
2317 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2318 assert(exit_status);
2319
2320 if (!params->prefix[type])
2321 return 0;
2322
2323 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2324 if (!uid_is_valid(uid))
2325 uid = 0;
2326 if (!gid_is_valid(gid))
2327 gid = 0;
2328 }
2329
2330 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2331 _cleanup_free_ char *p = NULL, *pp = NULL;
2332
2333 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2334 if (!p) {
2335 r = -ENOMEM;
2336 goto fail;
2337 }
2338
2339 r = mkdir_parents_label(p, 0755);
2340 if (r < 0)
2341 goto fail;
2342
2343 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2344
2345 /* If we are in user mode, and a configuration directory exists but a state directory
2346 * doesn't exist, then we likely are upgrading from an older systemd version that
2347 * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2348 * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2349 * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now
2350 * separated. If a service has both dirs configured but only the configuration dir
2351 * exists and the state dir does not, we assume we are looking at an update
2352 * situation. Hence, create a compatibility symlink, so that all expectations are
2353 * met.
2354 *
2355 * (We also do something similar with the log directory, which still doesn't exist in
2356 * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2357
2358 /* this assumes the state dir is always created before the configuration dir */
2359 assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2360 assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2361
2362 r = laccess(p, F_OK);
2363 if (r == -ENOENT) {
2364 _cleanup_free_ char *q = NULL;
2365
2366 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2367 * under the configuration hierarchy. */
2368
2369 if (type == EXEC_DIRECTORY_STATE)
2370 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2371 else if (type == EXEC_DIRECTORY_LOGS)
2372 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2373 else
2374 assert_not_reached();
2375 if (!q) {
2376 r = -ENOMEM;
2377 goto fail;
2378 }
2379
2380 r = laccess(q, F_OK);
2381 if (r >= 0) {
2382 /* It does exist! This hence looks like an update. Symlink the
2383 * configuration directory into the state directory. */
2384
2385 r = symlink_idempotent(q, p, /* make_relative= */ true);
2386 if (r < 0)
2387 goto fail;
2388
2389 log_exec_notice(context, params, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2390 continue;
2391 } else if (r != -ENOENT)
2392 log_exec_warning_errno(context, params, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2393
2394 } else if (r < 0)
2395 log_exec_warning_errno(context, params, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2396 }
2397
2398 if (exec_directory_is_private(context, type)) {
2399 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2400 * case we want to avoid leaving a directory around fully accessible that is owned by
2401 * a dynamic user whose UID is later on reused. To lock this down we use the same
2402 * trick used by container managers to prohibit host users to get access to files of
2403 * the same UID in containers: we place everything inside a directory that has an
2404 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2405 * for unprivileged host code. We then use fs namespacing to make this directory
2406 * permeable for the service itself.
2407 *
2408 * Specifically: for a service which wants a special directory "foo/" we first create
2409 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2410 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2411 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2412 * unprivileged host users can't look into it. Inside of the namespace of the unit
2413 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2414 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2415 * for the service and making sure it only gets access to the dirs it needs but no
2416 * others. Tricky? Yes, absolutely, but it works!
2417 *
2418 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2419 * to be owned by the service itself.
2420 *
2421 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2422 * for sharing files or sockets with other services. */
2423
2424 pp = path_join(params->prefix[type], "private");
2425 if (!pp) {
2426 r = -ENOMEM;
2427 goto fail;
2428 }
2429
2430 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2431 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2432 if (r < 0)
2433 goto fail;
2434
2435 if (!path_extend(&pp, context->directories[type].items[i].path)) {
2436 r = -ENOMEM;
2437 goto fail;
2438 }
2439
2440 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2441 r = mkdir_parents_label(pp, 0755);
2442 if (r < 0)
2443 goto fail;
2444
2445 if (is_dir(p, false) > 0 &&
2446 (laccess(pp, F_OK) == -ENOENT)) {
2447
2448 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2449 * it over. Most likely the service has been upgraded from one that didn't use
2450 * DynamicUser=1, to one that does. */
2451
2452 log_exec_info(context,
2453 params,
2454 "Found pre-existing public %s= directory %s, migrating to %s.\n"
2455 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2456 exec_directory_type_to_string(type), p, pp);
2457
2458 r = RET_NERRNO(rename(p, pp));
2459 if (r < 0)
2460 goto fail;
2461 } else {
2462 /* Otherwise, create the actual directory for the service */
2463
2464 r = mkdir_label(pp, context->directories[type].mode);
2465 if (r < 0 && r != -EEXIST)
2466 goto fail;
2467 }
2468
2469 if (!context->directories[type].items[i].only_create) {
2470 /* And link it up from the original place.
2471 * Notes
2472 * 1) If a mount namespace is going to be used, then this symlink remains on
2473 * the host, and a new one for the child namespace will be created later.
2474 * 2) It is not necessary to create this symlink when one of its parent
2475 * directories is specified and already created. E.g.
2476 * StateDirectory=foo foo/bar
2477 * In that case, the inode points to pp and p for "foo/bar" are the same:
2478 * pp = "/var/lib/private/foo/bar"
2479 * p = "/var/lib/foo/bar"
2480 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2481 * we do not need to create the symlink, but we cannot create the symlink.
2482 * See issue #24783. */
2483 r = symlink_idempotent(pp, p, true);
2484 if (r < 0)
2485 goto fail;
2486 }
2487
2488 } else {
2489 _cleanup_free_ char *target = NULL;
2490
2491 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2492 readlink_and_make_absolute(p, &target) >= 0) {
2493 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2494
2495 /* This already exists and is a symlink? Interesting. Maybe it's one created
2496 * by DynamicUser=1 (see above)?
2497 *
2498 * We do this for all directory types except for ConfigurationDirectory=,
2499 * since they all support the private/ symlink logic at least in some
2500 * configurations, see above. */
2501
2502 r = chase(target, NULL, 0, &target_resolved, NULL);
2503 if (r < 0)
2504 goto fail;
2505
2506 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2507 if (!q) {
2508 r = -ENOMEM;
2509 goto fail;
2510 }
2511
2512 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2513 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2514 if (r < 0)
2515 goto fail;
2516
2517 if (path_equal(q_resolved, target_resolved)) {
2518
2519 /* Hmm, apparently DynamicUser= was once turned on for this service,
2520 * but is no longer. Let's move the directory back up. */
2521
2522 log_exec_info(context,
2523 params,
2524 "Found pre-existing private %s= directory %s, migrating to %s.\n"
2525 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2526 exec_directory_type_to_string(type), q, p);
2527
2528 r = RET_NERRNO(unlink(p));
2529 if (r < 0)
2530 goto fail;
2531
2532 r = RET_NERRNO(rename(q, p));
2533 if (r < 0)
2534 goto fail;
2535 }
2536 }
2537
2538 r = mkdir_label(p, context->directories[type].mode);
2539 if (r < 0) {
2540 if (r != -EEXIST)
2541 goto fail;
2542
2543 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2544 struct stat st;
2545
2546 /* Don't change the owner/access mode of the configuration directory,
2547 * as in the common case it is not written to by a service, and shall
2548 * not be writable. */
2549
2550 r = RET_NERRNO(stat(p, &st));
2551 if (r < 0)
2552 goto fail;
2553
2554 /* Still complain if the access mode doesn't match */
2555 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2556 log_exec_warning(context,
2557 params,
2558 "%s \'%s\' already exists but the mode is different. "
2559 "(File system: %o %sMode: %o)",
2560 exec_directory_type_to_string(type), context->directories[type].items[i].path,
2561 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2562
2563 continue;
2564 }
2565 }
2566 }
2567
2568 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2569 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2570 * current UID/GID ownership.) */
2571 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2572 if (r < 0)
2573 goto fail;
2574
2575 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2576 * available to user code anyway */
2577 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2578 continue;
2579
2580 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2581 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2582 * assignments to exist. */
2583 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2584 if (r < 0)
2585 goto fail;
2586 }
2587
2588 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2589 * they are set up later, to allow configuring empty var/run/etc. */
2590 if (!needs_mount_namespace)
2591 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2592 r = create_many_symlinks(params->prefix[type],
2593 context->directories[type].items[i].path,
2594 context->directories[type].items[i].symlinks);
2595 if (r < 0)
2596 goto fail;
2597 }
2598
2599 return 0;
2600
2601fail:
2602 *exit_status = exit_status_table[type];
2603 return r;
2604}
2605
#if ENABLE_SMACK
/* Applies a SMACK process label, passing PID 0 to mac_smack_apply_pid() (presumably meaning the calling
 * process; confirm against smack-util).
 *
 * If context->smack_process_label is set, that label is applied. Otherwise, if
 * params->fallback_smack_process_label is set, the SMACK_ATTR_EXEC label is read from executable_fd and
 * applied; when the executable carries no such label the fallback label is applied instead. If neither
 * is configured nothing is done.
 *
 * Returns 0 on success or a negative error from the SMACK helpers. A missing xattr on the executable
 * (ERRNO_IS_XATTR_ABSENT) is not treated as an error. */
static int setup_smack(
                const ExecParameters *params,
                const ExecContext *context,
                int executable_fd) {
        int r;

        assert(params);
        assert(context); /* dereferenced unconditionally below */
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;
        } else if (params->fallback_smack_process_label) {
                _cleanup_free_ char *exec_label = NULL;

                /* exec_label stays NULL if the executable has no label; only real errors are fatal */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                        return r;

                r = mac_smack_apply_pid(0, exec_label ?: params->fallback_smack_process_label);
                if (r < 0)
                        return r;
        }

        return 0;
}
#endif
2635
2636static int compile_bind_mounts(
2637 const ExecContext *context,
2638 const ExecParameters *params,
2639 BindMount **ret_bind_mounts,
2640 size_t *ret_n_bind_mounts,
2641 char ***ret_empty_directories) {
2642
2643 _cleanup_strv_free_ char **empty_directories = NULL;
2644 BindMount *bind_mounts = NULL;
2645 size_t n, h = 0;
2646 int r;
2647
2648 assert(context);
2649 assert(params);
2650 assert(ret_bind_mounts);
2651 assert(ret_n_bind_mounts);
2652 assert(ret_empty_directories);
2653
2654 CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2655
2656 n = context->n_bind_mounts;
2657 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2658 if (!params->prefix[t])
2659 continue;
2660
2661 for (size_t i = 0; i < context->directories[t].n_items; i++)
2662 n += !context->directories[t].items[i].only_create;
2663 }
2664
2665 if (n <= 0) {
2666 *ret_bind_mounts = NULL;
2667 *ret_n_bind_mounts = 0;
2668 *ret_empty_directories = NULL;
2669 return 0;
2670 }
2671
2672 bind_mounts = new(BindMount, n);
2673 if (!bind_mounts)
2674 return -ENOMEM;
2675
2676 for (size_t i = 0; i < context->n_bind_mounts; i++) {
2677 BindMount *item = context->bind_mounts + i;
2678 _cleanup_free_ char *s = NULL, *d = NULL;
2679
2680 s = strdup(item->source);
2681 if (!s)
2682 return -ENOMEM;
2683
2684 d = strdup(item->destination);
2685 if (!d)
2686 return -ENOMEM;
2687
2688 bind_mounts[h++] = (BindMount) {
2689 .source = TAKE_PTR(s),
2690 .destination = TAKE_PTR(d),
2691 .read_only = item->read_only,
2692 .recursive = item->recursive,
2693 .ignore_enoent = item->ignore_enoent,
2694 };
2695 }
2696
2697 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2698 if (!params->prefix[t])
2699 continue;
2700
2701 if (context->directories[t].n_items == 0)
2702 continue;
2703
2704 if (exec_directory_is_private(context, t) &&
2705 !exec_context_with_rootfs(context)) {
2706 char *private_root;
2707
2708 /* So this is for a dynamic user, and we need to make sure the process can access its own
2709 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2710 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2711
2712 private_root = path_join(params->prefix[t], "private");
2713 if (!private_root)
2714 return -ENOMEM;
2715
2716 r = strv_consume(&empty_directories, private_root);
2717 if (r < 0)
2718 return r;
2719 }
2720
2721 for (size_t i = 0; i < context->directories[t].n_items; i++) {
2722 _cleanup_free_ char *s = NULL, *d = NULL;
2723
2724 /* When one of the parent directories is in the list, we cannot create the symlink
2725 * for the child directory. See also the comments in setup_exec_directory(). */
2726 if (context->directories[t].items[i].only_create)
2727 continue;
2728
2729 if (exec_directory_is_private(context, t))
2730 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
2731 else
2732 s = path_join(params->prefix[t], context->directories[t].items[i].path);
2733 if (!s)
2734 return -ENOMEM;
2735
2736 if (exec_directory_is_private(context, t) &&
2737 exec_context_with_rootfs(context))
2738 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2739 * directory is not created on the root directory. So, let's bind-mount the directory
2740 * on the 'non-private' place. */
2741 d = path_join(params->prefix[t], context->directories[t].items[i].path);
2742 else
2743 d = strdup(s);
2744 if (!d)
2745 return -ENOMEM;
2746
2747 bind_mounts[h++] = (BindMount) {
2748 .source = TAKE_PTR(s),
2749 .destination = TAKE_PTR(d),
2750 .read_only = false,
2751 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2752 .recursive = true,
2753 .ignore_enoent = false,
2754 };
2755 }
2756 }
2757
2758 assert(h == n);
2759
2760 *ret_bind_mounts = TAKE_PTR(bind_mounts);
2761 *ret_n_bind_mounts = n;
2762 *ret_empty_directories = TAKE_PTR(empty_directories);
2763
2764 return (int) n;
2765}
2766
2767/* ret_symlinks will contain a list of pairs src:dest that describes
2768 * the symlinks to create later on. For example, the symlinks needed
2769 * to safely give private directories to DynamicUser=1 users. */
2770static int compile_symlinks(
2771 const ExecContext *context,
2772 const ExecParameters *params,
2773 bool setup_os_release_symlink,
2774 char ***ret_symlinks) {
2775
2776 _cleanup_strv_free_ char **symlinks = NULL;
2777 int r;
2778
2779 assert(context);
2780 assert(params);
2781 assert(ret_symlinks);
2782
2783 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2784 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
2785 _cleanup_free_ char *private_path = NULL, *path = NULL;
2786
2787 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
2788 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
2789
2790 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2791 dst_abs = path_join(params->prefix[dt], *symlink);
2792 if (!src_abs || !dst_abs)
2793 return -ENOMEM;
2794
2795 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2796 if (r < 0)
2797 return r;
2798 }
2799
2800 if (!exec_directory_is_private(context, dt) ||
2801 exec_context_with_rootfs(context) ||
2802 context->directories[dt].items[i].only_create)
2803 continue;
2804
2805 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
2806 if (!private_path)
2807 return -ENOMEM;
2808
2809 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2810 if (!path)
2811 return -ENOMEM;
2812
2813 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2814 if (r < 0)
2815 return r;
2816 }
2817 }
2818
2819 /* We make the host's os-release available via a symlink, so that we can copy it atomically
2820 * and readers will never get a half-written version. Note that, while the paths specified here are
2821 * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2822 * 'os-release -> .os-release-stage/os-release' is what will be created. */
2823 if (setup_os_release_symlink) {
2824 r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release");
2825 if (r < 0)
2826 return r;
2827
2828 r = strv_extend(&symlinks, "/run/host/os-release");
2829 if (r < 0)
2830 return r;
2831 }
2832
2833 *ret_symlinks = TAKE_PTR(symlinks);
2834
2835 return 0;
2836}
2837
2838static bool insist_on_sandboxing(
2839 const ExecContext *context,
2840 const char *root_dir,
2841 const char *root_image,
2842 const BindMount *bind_mounts,
2843 size_t n_bind_mounts) {
2844
2845 assert(context);
2846 assert(n_bind_mounts == 0 || bind_mounts);
2847
2848 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2849 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2850 * rearrange stuff in a way we cannot ignore gracefully. */
2851
2852 if (context->n_temporary_filesystems > 0)
2853 return true;
2854
2855 if (root_dir || root_image)
2856 return true;
2857
2858 if (context->n_mount_images > 0)
2859 return true;
2860
2861 if (context->dynamic_user)
2862 return true;
2863
2864 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
2865 return true;
2866
2867 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2868 * essential. */
2869 for (size_t i = 0; i < n_bind_mounts; i++)
2870 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2871 return true;
2872
2873 if (context->log_namespace)
2874 return true;
2875
2876 return false;
2877}
2878
2879static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
2880 _cleanup_close_ int fd = -EBADF;
2881 int r;
2882
2883 if (!runtime || !runtime->ephemeral_copy)
2884 return 0;
2885
2886 r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
2887 if (r < 0)
2888 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
2889
2890 CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
2891
2892 fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
2893 if (fd >= 0)
2894 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
2895 return 0;
2896
2897 if (fd != -EAGAIN)
2898 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
2899
2900 log_debug("Making ephemeral snapshot of %s to %s",
2901 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
2902
2903 if (context->root_image)
2904 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
2905 COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
2906 else
2907 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
2908 AT_FDCWD, runtime->ephemeral_copy,
2909 BTRFS_SNAPSHOT_FALLBACK_COPY |
2910 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
2911 BTRFS_SNAPSHOT_RECURSIVE |
2912 BTRFS_SNAPSHOT_LOCK_BSD);
2913 if (fd < 0)
2914 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
2915 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
2916
2917 if (context->root_image) {
2918 /* A root image might be subject to lots of random writes so let's try to disable COW on it
2919 * which tends to not perform well in combination with lots of random writes.
2920 *
2921 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
2922 * copy, but we at least want to make the intention clear.
2923 */
2924 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
2925 if (r < 0)
2926 log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
2927 }
2928
2929 r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
2930 if (r < 0)
2931 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
2932
2933 return 1;
2934}
2935
2936static int verity_settings_prepare(
2937 VeritySettings *verity,
2938 const char *root_image,
2939 const void *root_hash,
2940 size_t root_hash_size,
2941 const char *root_hash_path,
2942 const void *root_hash_sig,
2943 size_t root_hash_sig_size,
2944 const char *root_hash_sig_path,
2945 const char *verity_data_path) {
2946
2947 int r;
2948
2949 assert(verity);
2950
2951 if (root_hash) {
2952 void *d;
2953
2954 d = memdup(root_hash, root_hash_size);
2955 if (!d)
2956 return -ENOMEM;
2957
2958 free_and_replace(verity->root_hash, d);
2959 verity->root_hash_size = root_hash_size;
2960 verity->designator = PARTITION_ROOT;
2961 }
2962
2963 if (root_hash_sig) {
2964 void *d;
2965
2966 d = memdup(root_hash_sig, root_hash_sig_size);
2967 if (!d)
2968 return -ENOMEM;
2969
2970 free_and_replace(verity->root_hash_sig, d);
2971 verity->root_hash_sig_size = root_hash_sig_size;
2972 verity->designator = PARTITION_ROOT;
2973 }
2974
2975 if (verity_data_path) {
2976 r = free_and_strdup(&verity->data_path, verity_data_path);
2977 if (r < 0)
2978 return r;
2979 }
2980
2981 r = verity_settings_load(
2982 verity,
2983 root_image,
2984 root_hash_path,
2985 root_hash_sig_path);
2986 if (r < 0)
2987 return log_debug_errno(r, "Failed to load root hash: %m");
2988
2989 return 0;
2990}
2991
2992static int apply_mount_namespace(
2993 ExecCommandFlags command_flags,
2994 const ExecContext *context,
2995 const ExecParameters *params,
2996 ExecRuntime *runtime,
2997 const char *memory_pressure_path,
2998 char **error_path) {
2999
3000 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3001 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3002 **read_write_paths_cleanup = NULL;
3003 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3004 *extension_dir = NULL, *host_os_release_stage = NULL;
3005 const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
3006 char **read_write_paths;
3007 bool needs_sandboxing, setup_os_release_symlink;
3008 BindMount *bind_mounts = NULL;
3009 size_t n_bind_mounts = 0;
3010 int r;
3011
3012 assert(context);
3013
3014 CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3015
3016 if (params->flags & EXEC_APPLY_CHROOT) {
3017 r = setup_ephemeral(context, runtime);
3018 if (r < 0)
3019 return r;
3020
3021 if (context->root_image)
3022 root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
3023 else
3024 root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
3025 }
3026
3027 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3028 if (r < 0)
3029 return r;
3030
3031 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3032 * service will need to write to it in order to start the notifications. */
3033 if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3034 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3035 if (!read_write_paths_cleanup)
3036 return -ENOMEM;
3037
3038 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3039 if (r < 0)
3040 return r;
3041
3042 read_write_paths = read_write_paths_cleanup;
3043 } else
3044 read_write_paths = context->read_write_paths;
3045
3046 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3047 if (needs_sandboxing) {
3048 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3049 * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3050 * use here. This does not apply when we are using /run/systemd/empty as fallback. */
3051
3052 if (context->private_tmp && runtime && runtime->shared) {
3053 if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3054 tmp_dir = runtime->shared->tmp_dir;
3055 else if (runtime->shared->tmp_dir)
3056 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
3057
3058 if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3059 var_tmp_dir = runtime->shared->var_tmp_dir;
3060 else if (runtime->shared->var_tmp_dir)
3061 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
3062 }
3063 }
3064
3065 /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3066 setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
3067 r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3068 if (r < 0)
3069 return r;
3070
3071 if (context->mount_propagation_flag == MS_SHARED)
3072 log_exec_debug(context,
3073 params,
3074 "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3075
3076 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3077 r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path);
3078 if (r < 0)
3079 return r;
3080 }
3081
3082 if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
3083 propagate_dir = path_join("/run/systemd/propagate/", params->unit_id);
3084 if (!propagate_dir)
3085 return -ENOMEM;
3086
3087 incoming_dir = strdup("/run/systemd/incoming");
3088 if (!incoming_dir)
3089 return -ENOMEM;
3090
3091 extension_dir = strdup("/run/systemd/unit-extensions");
3092 if (!extension_dir)
3093 return -ENOMEM;
3094
3095 /* If running under a different root filesystem, propagate the host's os-release. We make a
3096 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3097 if (setup_os_release_symlink) {
3098 host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3099 if (!host_os_release_stage)
3100 return -ENOMEM;
3101 }
3102 } else {
3103 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3104
3105 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3106 return -ENOMEM;
3107
3108 if (setup_os_release_symlink) {
3109 if (asprintf(&host_os_release_stage,
3110 "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3111 geteuid()) < 0)
3112 return -ENOMEM;
3113 }
3114 }
3115
3116 if (root_image) {
3117 r = verity_settings_prepare(
3118 &verity,
3119 root_image,
3120 context->root_hash, context->root_hash_size, context->root_hash_path,
3121 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3122 context->root_verity);
3123 if (r < 0)
3124 return r;
3125 }
3126
3127 NamespaceParameters parameters = {
3128 .runtime_scope = params->runtime_scope,
3129
3130 .root_directory = root_dir,
3131 .root_image = root_image,
3132 .root_image_options = context->root_image_options,
3133 .root_image_policy = context->root_image_policy ?: &image_policy_service,
3134
3135 .read_write_paths = read_write_paths,
3136 .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
3137 .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
3138
3139 .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
3140 .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
3141
3142 .empty_directories = empty_directories,
3143 .symlinks = symlinks,
3144
3145 .bind_mounts = bind_mounts,
3146 .n_bind_mounts = n_bind_mounts,
3147
3148 .temporary_filesystems = context->temporary_filesystems,
3149 .n_temporary_filesystems = context->n_temporary_filesystems,
3150
3151 .mount_images = context->mount_images,
3152 .n_mount_images = context->n_mount_images,
3153 .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
3154
3155 .tmp_dir = tmp_dir,
3156 .var_tmp_dir = var_tmp_dir,
3157
3158 .creds_path = creds_path,
3159 .log_namespace = context->log_namespace,
3160 .mount_propagation_flag = context->mount_propagation_flag,
3161
3162 .verity = &verity,
3163
3164 .extension_images = context->extension_images,
3165 .n_extension_images = context->n_extension_images,
3166 .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
3167 .extension_directories = context->extension_directories,
3168
3169 .propagate_dir = propagate_dir,
3170 .incoming_dir = incoming_dir,
3171 .extension_dir = extension_dir,
3172 .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
3173 .host_os_release_stage = host_os_release_stage,
3174
3175 /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
3176 * otherwise enforce it, don't ignore protected paths and fail if we are enable to apply the
3177 * sandbox inside the mount namespace. */
3178 .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
3179
3180 .protect_control_groups = needs_sandboxing && context->protect_control_groups,
3181 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
3182 .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
3183 .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
3184 .protect_hostname = needs_sandboxing && context->protect_hostname,
3185
3186 .private_dev = needs_sandboxing && context->private_devices,
3187 .private_network = needs_sandboxing && exec_needs_network_namespace(context),
3188 .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
3189
3190 .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
3191
3192 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3193 .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
3194
abcf5997
FS
3195 .protect_home = needs_sandboxing ? context->protect_home : false,
3196 .protect_system = needs_sandboxing ? context->protect_system : false,
3197 .protect_proc = needs_sandboxing ? context->protect_proc : false,
3198 .proc_subset = needs_sandboxing ? context->proc_subset : false,
75689fb2
LB
3199 };
3200
3201 r = setup_namespace(&parameters, error_path);
3202 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3203 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3204 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3205 * completely different execution environment. */
3206 if (r == -ENOANO) {
3207 if (insist_on_sandboxing(
3208 context,
3209 root_dir, root_image,
3210 bind_mounts,
3211 n_bind_mounts))
3212 return log_exec_debug_errno(context,
3213 params,
3214 SYNTHETIC_ERRNO(EOPNOTSUPP),
3215 "Failed to set up namespace, and refusing to continue since "
3216 "the selected namespacing options alter mount environment non-trivially.\n"
3217 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3218 n_bind_mounts,
3219 context->n_temporary_filesystems,
3220 yes_no(root_dir),
3221 yes_no(root_image),
3222 yes_no(context->dynamic_user));
3223
3224 log_exec_debug(context, params, "Failed to set up namespace, assuming containerized execution and ignoring.");
3225 return 0;
3226 }
3227
3228 return r;
3229}
3230
3231static int apply_working_directory(
3232 const ExecContext *context,
3233 const ExecParameters *params,
3234 ExecRuntime *runtime,
3235 const char *home,
3236 int *exit_status) {
3237
3238 const char *d, *wd;
3239
3240 assert(context);
3241 assert(exit_status);
3242
3243 if (context->working_directory_home) {
3244
3245 if (!home) {
3246 *exit_status = EXIT_CHDIR;
3247 return -ENXIO;
3248 }
3249
3250 wd = home;
3251
3252 } else
3253 wd = empty_to_root(context->working_directory);
3254
3255 if (params->flags & EXEC_APPLY_CHROOT)
3256 d = wd;
3257 else
3258 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
3259
3260 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3261 *exit_status = EXIT_CHDIR;
3262 return -errno;
3263 }
3264
3265 return 0;
3266}
3267
3268static int apply_root_directory(
3269 const ExecContext *context,
3270 const ExecParameters *params,
3271 ExecRuntime *runtime,
3272 const bool needs_mount_ns,
3273 int *exit_status) {
3274
3275 assert(context);
3276 assert(exit_status);
3277
3278 if (params->flags & EXEC_APPLY_CHROOT)
3279 if (!needs_mount_ns && context->root_directory)
3280 if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
3281 *exit_status = EXIT_CHROOT;
3282 return -errno;
3283 }
3284
3285 return 0;
3286}
3287
3288static int setup_keyring(
3289 const ExecContext *context,
3290 const ExecParameters *p,
3291 uid_t uid, gid_t gid) {
3292
3293 key_serial_t keyring;
3294 int r = 0;
3295 uid_t saved_uid;
3296 gid_t saved_gid;
3297
3298 assert(context);
3299 assert(p);
3300
3301 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3302 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3303 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3304 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3305 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3306 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3307
3308 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3309 return 0;
3310
3311 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3312 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3313 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3314 * & group is just as nasty as acquiring a reference to the user keyring. */
3315
3316 saved_uid = getuid();
3317 saved_gid = getgid();
3318
3319 if (gid_is_valid(gid) && gid != saved_gid) {
3320 if (setregid(gid, -1) < 0)
3321 return log_exec_error_errno(context,
3322 p,
3323 errno,
3324 "Failed to change GID for user keyring: %m");
3325 }
3326
3327 if (uid_is_valid(uid) && uid != saved_uid) {
3328 if (setreuid(uid, -1) < 0) {
3329 r = log_exec_error_errno(context,
3330 p,
3331 errno,
3332 "Failed to change UID for user keyring: %m");
3333 goto out;
3334 }
3335 }
3336
3337 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3338 if (keyring == -1) {
3339 if (errno == ENOSYS)
3340 log_exec_debug_errno(context,
3341 p,
3342 errno,
3343 "Kernel keyring not supported, ignoring.");
3344 else if (ERRNO_IS_PRIVILEGE(errno))
3345 log_exec_debug_errno(context,
3346 p,
3347 errno,
3348 "Kernel keyring access prohibited, ignoring.");
3349 else if (errno == EDQUOT)
3350 log_exec_debug_errno(context,
3351 p,
3352 errno,
3353 "Out of kernel keyrings to allocate, ignoring.");
3354 else
3355 r = log_exec_error_errno(context,
3356 p,
3357 errno,
3358 "Setting up kernel keyring failed: %m");
3359
3360 goto out;
3361 }
3362
3363 /* When requested link the user keyring into the session keyring. */
3364 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3365
3366 if (keyctl(KEYCTL_LINK,
3367 KEY_SPEC_USER_KEYRING,
3368 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3369 r = log_exec_error_errno(context,
3370 p,
3371 errno,
3372 "Failed to link user keyring into session keyring: %m");
3373 goto out;
3374 }
3375 }
3376
3377 /* Restore uid/gid back */
3378 if (uid_is_valid(uid) && uid != saved_uid) {
3379 if (setreuid(saved_uid, -1) < 0) {
3380 r = log_exec_error_errno(context,
3381 p,
3382 errno,
3383 "Failed to change UID back for user keyring: %m");
3384 goto out;
3385 }
3386 }
3387
3388 if (gid_is_valid(gid) && gid != saved_gid) {
3389 if (setregid(saved_gid, -1) < 0)
3390 return log_exec_error_errno(context,
3391 p,
3392 errno,
3393 "Failed to change GID back for user keyring: %m");
3394 }
3395
3396 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3397 if (!sd_id128_is_null(p->invocation_id)) {
3398 key_serial_t key;
3399
3400 key = add_key("user",
3401 "invocation_id",
3402 &p->invocation_id,
3403 sizeof(p->invocation_id),
3404 KEY_SPEC_SESSION_KEYRING);
3405 if (key == -1)
3406 log_exec_debug_errno(context,
3407 p,
3408 errno,
3409 "Failed to add invocation ID to keyring, ignoring: %m");
3410 else {
3411 if (keyctl(KEYCTL_SETPERM, key,
3412 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3413 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3414 r = log_exec_error_errno(context,
3415 p,
3416 errno,
3417 "Failed to restrict invocation ID permission: %m");
3418 }
3419 }
3420
3421out:
3422 /* Revert back uid & gid for the last time, and exit */
3423 /* no extra logging, as only the first already reported error matters */
3424 if (getuid() != saved_uid)
3425 (void) setreuid(saved_uid, -1);
3426
3427 if (getgid() != saved_gid)
3428 (void) setregid(saved_gid, -1);
3429
3430 return r;
3431}
3432
/* Appends each valid (i.e. non-negative) fd of the given pair to the array, advancing *n for every element
 * added. The caller has to make sure the array has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++)
                if (pair[i] >= 0)
                        array[(*n)++] = pair[i];
}
3443
3444static int close_remaining_fds(
3445 const ExecParameters *params,
3446 const ExecRuntime *runtime,
3447 int socket_fd,
3448 const int *fds, size_t n_fds) {
3449
3450 size_t n_dont_close = 0;
3451 int dont_close[n_fds + 14];
3452
3453 assert(params);
3454
3455 if (params->stdin_fd >= 0)
3456 dont_close[n_dont_close++] = params->stdin_fd;
3457 if (params->stdout_fd >= 0)
3458 dont_close[n_dont_close++] = params->stdout_fd;
3459 if (params->stderr_fd >= 0)
3460 dont_close[n_dont_close++] = params->stderr_fd;
3461
3462 if (socket_fd >= 0)
3463 dont_close[n_dont_close++] = socket_fd;
3464 if (n_fds > 0) {
3465 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3466 n_dont_close += n_fds;
3467 }
3468
3469 if (runtime)
3470 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3471
3472 if (runtime && runtime->shared) {
3473 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3474 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
3475 }
3476
3477 if (runtime && runtime->dynamic_creds) {
3478 if (runtime->dynamic_creds->user)
3479 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3480 if (runtime->dynamic_creds->group)
3481 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
3482 }
3483
3484 if (params->user_lookup_fd >= 0)
3485 dont_close[n_dont_close++] = params->user_lookup_fd;
3486
3487 return close_all_fds(dont_close, n_dont_close);
3488}
3489
/* Sends the resolved UID/GID to PID 1 once we learnt them: a single write consisting of the UID, the GID
 * and the unit name, in this order. Nothing is sent if there's no fd to send it to, or if neither a valid
 * UID nor a valid GID is known.
 *
 * Returns 0 on success or if nothing was sent, -errno if writev() fails. */
static int send_user_lookup(
                const char *unit_id,
                int user_lookup_fd,
                uid_t uid,
                gid_t gid) {

        assert(unit_id);

        if (user_lookup_fd < 0 ||
            (!uid_is_valid(uid) && !gid_is_valid(gid)))
                return 0;

        struct iovec iov[] = {
                IOVEC_MAKE(&uid, sizeof(uid)),
                IOVEC_MAKE(&gid, sizeof(gid)),
                IOVEC_MAKE_STRING(unit_id),
        };

        if (writev(user_lookup_fd, iov, 3) < 0)
                return -errno;

        return 0;
}
3517
3518static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3519 int r;
3520
3521 assert(c);
3522 assert(home);
3523 assert(buf);
3524
3525 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3526
3527 if (*home)
3528 return 0;
3529
3530 if (!c->working_directory_home)
3531 return 0;
3532
3533 r = get_home_dir(buf);
3534 if (r < 0)
3535 return r;
3536
3537 *home = *buf;
3538 return 1;
3539}
3540
3541static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3542 _cleanup_strv_free_ char ** list = NULL;
3543 int r;
3544
3545 assert(c);
3546 assert(p);
3547 assert(ret);
3548
3549 assert(c->dynamic_user);
3550
3551 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3552 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3553 * directories. */
3554
3555 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3556 if (t == EXEC_DIRECTORY_CONFIGURATION)
3557 continue;
3558
3559 if (!p->prefix[t])
3560 continue;
3561
3562 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3563 char *e;
3564
3565 if (exec_directory_is_private(c, t))
3566 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3567 else
3568 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3569 if (!e)
3570 return -ENOMEM;
3571
3572 r = strv_consume(&list, e);
3573 if (r < 0)
3574 return r;
3575 }
3576 }
3577
3578 *ret = TAKE_PTR(list);
3579
3580 return 0;
3581}
3582
3583static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3584 _cleanup_(cpu_set_reset) CPUSet s = {};
3585 int r;
3586
3587 assert(c);
3588 assert(ret);
3589
3590 if (!c->numa_policy.nodes.set) {
3591 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3592 return 0;
3593 }
3594
3595 r = numa_to_cpu_set(&c->numa_policy, &s);
3596 if (r < 0)
3597 return r;
3598
3599 cpu_set_reset(ret);
3600
3601 return cpu_set_add_all(ret, &s);
3602}
3603
3604static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3605 int r;
3606
3607 assert(fds);
3608 assert(n_fds);
3609 assert(*n_fds < fds_size);
3610 assert(ret_fd);
3611
3612 if (fd < 0) {
3613 *ret_fd = -EBADF;
3614 return 0;
3615 }
3616
3617 if (fd < 3 + (int) *n_fds) {
3618 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3619 * the fds we pass to the process (or which are closed only during execve). */
3620
3621 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3622 if (r < 0)
3623 return -errno;
3624
3625 close_and_replace(fd, r);
3626 }
3627
3628 *ret_fd = fds[*n_fds] = fd;
3629 (*n_fds) ++;
3630 return 1;
3631}
3632
3633static int connect_unix_harder(const ExecContext *c, const ExecParameters *p, const OpenFile *of, int ofd) {
3634 union sockaddr_union addr = {
3635 .un.sun_family = AF_UNIX,
3636 };
3637 socklen_t sa_len;
3638 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3639 int r;
3640
3641 assert(c);
3642 assert(p);
3643 assert(of);
3644 assert(ofd >= 0);
3645
3646 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3647 if (r < 0)
3648 return log_exec_error_errno(c, p, r, "Failed to set sockaddr for %s: %m", of->path);
3649
3650 sa_len = r;
3651
3652 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
3653 _cleanup_close_ int fd = -EBADF;
3654
3655 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
3656 if (fd < 0)
3657 return log_exec_error_errno(c,
3658 p,
3659 errno,
3660 "Failed to create socket for %s: %m",
3661 of->path);
3662
3663 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3664 if (r == -EPROTOTYPE)
3665 continue;
3666 if (r < 0)
3667 return log_exec_error_errno(c,
3668 p,
3669 r,
3670 "Failed to connect socket for %s: %m",
3671 of->path);
3672
3673 return TAKE_FD(fd);
3674 }
3675
3676 return log_exec_error_errno(c,
3677 p,
3678 SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".",
3679 of->path);
3680}
3681
3682static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const OpenFile *of) {
3683 struct stat st;
3684 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3685
3686 assert(c);
3687 assert(p);
3688 assert(of);
3689
3690 ofd = open(of->path, O_PATH | O_CLOEXEC);
3691 if (ofd < 0)
3692 return log_exec_error_errno(c, p, errno, "Could not open \"%s\": %m", of->path);
3693
3694 if (fstat(ofd, &st) < 0)
3695 return log_exec_error_errno(c, p, errno, "Failed to stat %s: %m", of->path);
3696
3697 if (S_ISSOCK(st.st_mode)) {
3698 fd = connect_unix_harder(c, p, of, ofd);
3699 if (fd < 0)
3700 return fd;
3701
3702 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
3703 return log_exec_error_errno(c, p, errno, "Failed to shutdown send for socket %s: %m",
3704 of->path);
3705
3706 log_exec_debug(c, p, "socket %s opened (fd=%d)", of->path, fd);
3707 } else {
3708 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3709 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3710 flags |= O_APPEND;
3711 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3712 flags |= O_TRUNC;
3713
3714 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3715 if (fd < 0)
3716 return log_exec_error_errno(c, p, fd, "Failed to open file %s: %m", of->path);
3717
3718 log_exec_debug(c, p, "file %s opened (fd=%d)", of->path, fd);
3719 }
3720
3721 return TAKE_FD(fd);
3722}
3723
3724static int collect_open_file_fds(
3725 const ExecContext *c,
3726 const ExecParameters *p,
3727 int **fds,
3728 char ***fdnames,
3729 size_t *n_fds) {
3730 int r;
3731
3732 assert(c);
3733 assert(p);
3734 assert(fds);
3735 assert(fdnames);
3736 assert(n_fds);
3737
3738 LIST_FOREACH(open_files, of, p->open_files) {
3739 _cleanup_close_ int fd = -EBADF;
3740
3741 fd = get_open_file_fd(c, p, of);
3742 if (fd < 0) {
3743 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3744 log_exec_debug_errno(c, p, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
3745 continue;
3746 }
3747
3748 return fd;
3749 }
3750
3751 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
3752 return -ENOMEM;
3753
3754 r = strv_extend(fdnames, of->fdname);
3755 if (r < 0)
3756 return r;
3757
3758 (*fds)[*n_fds] = TAKE_FD(fd);
3759
3760 (*n_fds)++;
3761 }
3762
3763 return 0;
3764}
3765
3766static void log_command_line(
3767 const ExecContext *context,
3768 const ExecParameters *params,
3769 const char *msg,
3770 const char *executable,
3771 char **argv) {
3772
3773 assert(context);
3774 assert(params);
3775 assert(msg);
3776 assert(executable);
3777
3778 if (!DEBUG_LOGGING)
3779 return;
3780
3781 _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3782
3783 log_exec_struct(context, params, LOG_DEBUG,
3784 "EXECUTABLE=%s", executable,
3785 LOG_EXEC_MESSAGE(params, "%s: %s", msg, strnull(cmdline)),
3786 LOG_EXEC_INVOCATION_ID(params));
3787}
3788
3789static bool exec_context_need_unprivileged_private_users(
3790 const ExecContext *context,
3791 const ExecParameters *params) {
3792
3793 assert(context);
3794 assert(params);
3795
3796 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3797 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3798 * (system manager) then we have privileges and don't need this. */
3799 if (params->runtime_scope != RUNTIME_SCOPE_USER)
3800 return false;
3801
3802 return context->private_users ||
3803 context->private_tmp ||
3804 context->private_devices ||
3805 context->private_network ||
3806 context->network_namespace_path ||
3807 context->private_ipc ||
3808 context->ipc_namespace_path ||
3809 context->private_mounts > 0 ||
3810 context->mount_apivfs ||
3811 context->n_bind_mounts > 0 ||
3812 context->n_temporary_filesystems > 0 ||
3813 context->root_directory ||
3814 !strv_isempty(context->extension_directories) ||
3815 context->protect_system != PROTECT_SYSTEM_NO ||
3816 context->protect_home != PROTECT_HOME_NO ||
3817 context->protect_kernel_tunables ||
3818 context->protect_kernel_modules ||
3819 context->protect_kernel_logs ||
3820 context->protect_control_groups ||
3821 context->protect_clock ||
3822 context->protect_hostname ||
3823 !strv_isempty(context->read_write_paths) ||
3824 !strv_isempty(context->read_only_paths) ||
3825 !strv_isempty(context->inaccessible_paths) ||
3826 !strv_isempty(context->exec_paths) ||
3827 !strv_isempty(context->no_exec_paths);
3828}
3829
3830static bool exec_context_shall_confirm_spawn(const ExecContext *context) {
3831 assert(context);
3832
3833 if (confirm_spawn_disabled())
3834 return false;
3835
3836 /* For some reasons units remaining in the same process group
3837 * as PID 1 fail to acquire the console even if it's not used
3838 * by any process. So skip the confirmation question for them. */
3839 return !context->same_pgrp;
3840}
3841
3842static int exec_context_named_iofds(
3843 const ExecContext *c,
3844 const ExecParameters *p,
3845 int named_iofds[static 3]) {
3846
3847 size_t targets;
3848 const char* stdio_fdname[3];
3849 size_t n_fds;
3850
3851 assert(c);
3852 assert(p);
3853 assert(named_iofds);
3854
3855 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3856 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3857 (c->std_error == EXEC_OUTPUT_NAMED_FD);
3858
3859 for (size_t i = 0; i < 3; i++)
3860 stdio_fdname[i] = exec_context_fdname(c, i);
3861
3862 n_fds = p->n_storage_fds + p->n_socket_fds;
3863
3864 for (size_t i = 0; i < n_fds && targets > 0; i++)
3865 if (named_iofds[STDIN_FILENO] < 0 &&
3866 c->std_input == EXEC_INPUT_NAMED_FD &&
3867 stdio_fdname[STDIN_FILENO] &&
3868 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3869
3870 named_iofds[STDIN_FILENO] = p->fds[i];
3871 targets--;
3872
3873 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3874 c->std_output == EXEC_OUTPUT_NAMED_FD &&
3875 stdio_fdname[STDOUT_FILENO] &&
3876 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3877
3878 named_iofds[STDOUT_FILENO] = p->fds[i];
3879 targets--;
3880
3881 } else if (named_iofds[STDERR_FILENO] < 0 &&
3882 c->std_error == EXEC_OUTPUT_NAMED_FD &&
3883 stdio_fdname[STDERR_FILENO] &&
3884 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3885
3886 named_iofds[STDERR_FILENO] = p->fds[i];
3887 targets--;
3888 }
3889
3890 return targets == 0 ? 0 : -ENOENT;
3891}
3892
7b6d3dcd
LB
3893static void exec_shared_runtime_close(ExecSharedRuntime *shared) {
3894 if (!shared)
3895 return;
3896
3897 safe_close_pair(shared->netns_storage_socket);
3898 safe_close_pair(shared->ipcns_storage_socket);
3899}
3900
3901static void exec_runtime_close(ExecRuntime *rt) {
3902 if (!rt)
3903 return;
3904
3905 safe_close_pair(rt->ephemeral_storage_socket);
3906
3907 exec_shared_runtime_close(rt->shared);
3908 dynamic_creds_close(rt->dynamic_creds);
3909}
3910
3911static void exec_params_close(ExecParameters *p) {
3912 if (!p)
3913 return;
3914
3915 p->stdin_fd = safe_close(p->stdin_fd);
3916 p->stdout_fd = safe_close(p->stdout_fd);
3917 p->stderr_fd = safe_close(p->stderr_fd);
3918}
3919
75689fb2
LB
3920int exec_invoke(
3921 const ExecCommand *command,
3922 const ExecContext *context,
3923 ExecParameters *params,
3924 ExecRuntime *runtime,
3925 const CGroupContext *cgroup_context,
3926 int *exit_status) {
3927
3928 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
3929 int r, ngids = 0, exec_fd;
3930 _cleanup_free_ gid_t *supplementary_gids = NULL;
3931 const char *username = NULL, *groupname = NULL;
3932 _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
3933 const char *home = NULL, *shell = NULL;
3934 char **final_argv = NULL;
3935 dev_t journal_stream_dev = 0;
3936 ino_t journal_stream_ino = 0;
3937 bool userns_set_up = false;
3938 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3939 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3940 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3941 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
24832d10 3942 bool keep_seccomp_privileges = false;
75689fb2
LB
3943#if HAVE_SELINUX
3944 _cleanup_free_ char *mac_selinux_context_net = NULL;
3945 bool use_selinux = false;
3946#endif
3947#if ENABLE_SMACK
3948 bool use_smack = false;
3949#endif
3950#if HAVE_APPARMOR
3951 bool use_apparmor = false;
24832d10
ILG
3952#endif
3953#if HAVE_SECCOMP
3954 uint64_t saved_bset = 0;
75689fb2
LB
3955#endif
3956 uid_t saved_uid = getuid();
3957 gid_t saved_gid = getgid();
3958 uid_t uid = UID_INVALID;
3959 gid_t gid = GID_INVALID;
3960 size_t n_fds, /* fds to pass to the child */
3961 n_keep_fds; /* total number of fds not to close */
3962 int secure_bits;
3963 _cleanup_free_ gid_t *gids_after_pam = NULL;
3964 int ngids_after_pam = 0;
3965 _cleanup_free_ int *fds = NULL;
3966 _cleanup_strv_free_ char **fdnames = NULL;
3967
71136404 3968 int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET, *params_fds = NULL;
75689fb2
LB
3969 size_t n_storage_fds = 0, n_socket_fds = 0;
3970
3971 assert(command);
3972 assert(context);
3973 assert(params);
3974 assert(exit_status);
3975
3976 /* Explicitly test for CVE-2021-4034 inspired invocations */
856bed0a
LB
3977 if (!command->path || strv_isempty(command->argv)) {
3978 *exit_status = EXIT_EXEC;
3979 return log_exec_error_errno(
3980 context,
3981 params,
3982 SYNTHETIC_ERRNO(EINVAL),
3983 "Invalid command line arguments.");
3984 }
75689fb2
LB
3985
3986 LOG_CONTEXT_PUSH_EXEC(context, params);
3987
3988 if (context->std_input == EXEC_INPUT_SOCKET ||
3989 context->std_output == EXEC_OUTPUT_SOCKET ||
3990 context->std_error == EXEC_OUTPUT_SOCKET) {
3991
3992 if (params->n_socket_fds > 1)
3993 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
3994
3995 if (params->n_socket_fds == 0)
3996 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
3997
3998 socket_fd = params->fds[0];
3999 } else {
4000 params_fds = params->fds;
4001 n_socket_fds = params->n_socket_fds;
4002 n_storage_fds = params->n_storage_fds;
4003 }
4004 n_fds = n_socket_fds + n_storage_fds;
4005
4006 r = exec_context_named_iofds(context, params, named_iofds);
4007 if (r < 0)
4008 return log_exec_error_errno(context, params, r, "Failed to load a named file descriptor: %m");
4009
4010 rename_process_from_path(command->path);
4011
4012 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4013 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4014 * both of which will be demoted to SIG_DFL. */
4015 (void) default_signals(SIGNALS_CRASH_HANDLER,
4016 SIGNALS_IGNORE);
4017
4018 if (context->ignore_sigpipe)
4019 (void) ignore_signals(SIGPIPE);
4020
4021 r = reset_signal_mask();
4022 if (r < 0) {
4023 *exit_status = EXIT_SIGNAL_MASK;
4024 return log_exec_error_errno(context, params, r, "Failed to set process signal mask: %m");
4025 }
4026
4027 if (params->idle_pipe)
4028 do_idle_pipe_dance(params->idle_pipe);
4029
4030 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4031 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4032 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4033 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4034
4035 log_forget_fds();
4036 log_set_open_when_needed(true);
4037 log_settle_target();
4038 if (context->log_level_max >= 0)
4039 log_set_max_level(context->log_level_max);
4040
4041 /* In case anything used libc syslog(), close this here, too */
4042 closelog();
4043
4044 fds = newdup(int, params_fds, n_fds);
4045 if (!fds) {
4046 *exit_status = EXIT_MEMORY;
4047 return log_oom();
4048 }
4049
4050 fdnames = strv_copy((char**) params->fd_names);
4051 if (!fdnames) {
4052 *exit_status = EXIT_MEMORY;
4053 return log_oom();
4054 }
4055
4056 r = collect_open_file_fds(context, params, &fds, &fdnames, &n_fds);
4057 if (r < 0) {
4058 *exit_status = EXIT_FDS;
4059 return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m");
4060 }
4061
4062 int keep_fds[n_fds + 3];
4063 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4064 n_keep_fds = n_fds;
4065
4066 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4067 if (r < 0) {
4068 *exit_status = EXIT_FDS;
4069 return log_exec_error_errno(context, params, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4070 }
4071
4072#if HAVE_LIBBPF
4073 if (params->bpf_outer_map_fd >= 0) {
4074 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->bpf_outer_map_fd, (int *)&params->bpf_outer_map_fd);
4075 if (r < 0) {
4076 *exit_status = EXIT_FDS;
4077 return log_exec_error_errno(context, params, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4078 }
4079 }
4080#endif
4081
4082 r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds);
4083 if (r < 0) {
4084 *exit_status = EXIT_FDS;
4085 return log_exec_error_errno(context, params, r, "Failed to close unwanted file descriptors: %m");
4086 }
4087
4088 if (!context->same_pgrp &&
4089 setsid() < 0) {
4090 *exit_status = EXIT_SETSID;
4091 return log_exec_error_errno(context, params, errno, "Failed to create new process session: %m");
4092 }
4093
4094 exec_context_tty_reset(context, params);
4095
4096 if (params->shall_confirm_spawn && exec_context_shall_confirm_spawn(context)) {
4097 _cleanup_free_ char *cmdline = NULL;
4098
4099 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4100 if (!cmdline) {
4101 *exit_status = EXIT_MEMORY;
4102 return log_oom();
4103 }
4104
4105 r = ask_for_confirmation(context, params, cmdline);
4106 if (r != CONFIRM_EXECUTE) {
4107 if (r == CONFIRM_PRETEND_SUCCESS) {
4108 *exit_status = EXIT_SUCCESS;
4109 return 0;
4110 }
4111
4112 *exit_status = EXIT_CONFIRM;
4113 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ECANCELED),
4114 "Execution cancelled by the user");
4115 }
4116 }
4117
4118 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4119 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4120 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4121 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4122 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4123 if (setenv("SYSTEMD_ACTIVATION_UNIT", params->unit_id, true) != 0 ||
4124 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4125 *exit_status = EXIT_MEMORY;
4126 return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4127 }
4128
4129 if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4130 _cleanup_strv_free_ char **suggested_paths = NULL;
4131
4132 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4133 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4134 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4135 *exit_status = EXIT_USER;
4136 return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4137 }
4138
4139 r = compile_suggested_paths(context, params, &suggested_paths);
4140 if (r < 0) {
4141 *exit_status = EXIT_MEMORY;
4142 return log_oom();
4143 }
4144
4145 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4146 if (r < 0) {
4147 *exit_status = EXIT_USER;
4148 if (r == -EILSEQ)
4149 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4150 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4151 return log_exec_error_errno(context, params, r, "Failed to update dynamic user credentials: %m");
4152 }
4153
4154 if (!uid_is_valid(uid)) {
4155 *exit_status = EXIT_USER;
4156 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4157 }
4158
4159 if (!gid_is_valid(gid)) {
4160 *exit_status = EXIT_USER;
4161 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4162 }
4163
4164 if (runtime->dynamic_creds->user)
4165 username = runtime->dynamic_creds->user->name;
4166
4167 } else {
4168 if (context->user) {
4169 r = get_fixed_user(context->user, &username, &uid, &gid, &home, &shell);
4170 if (r < 0) {
4171 *exit_status = EXIT_USER;
4172 return log_exec_error_errno(context, params, r, "Failed to determine user credentials: %m");
4173 }
4174 }
4175
4176 if (context->group) {
4177 r = get_fixed_group(context->group, &groupname, &gid);
4178 if (r < 0) {
4179 *exit_status = EXIT_GROUP;
4180 return log_exec_error_errno(context, params, r, "Failed to determine group credentials: %m");
4181 }
4182 }
4183 }
4184
4185 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4186 r = get_supplementary_groups(context, username, groupname, gid,
4187 &supplementary_gids, &ngids);
4188 if (r < 0) {
4189 *exit_status = EXIT_GROUP;
4190 return log_exec_error_errno(context, params, r, "Failed to determine supplementary groups: %m");
4191 }
4192
4193 r = send_user_lookup(params->unit_id, params->user_lookup_fd, uid, gid);
4194 if (r < 0) {
4195 *exit_status = EXIT_USER;
4196 return log_exec_error_errno(context, params, r, "Failed to send user credentials to PID1: %m");
4197 }
4198
4199 params->user_lookup_fd = safe_close(params->user_lookup_fd);
4200
4201 r = acquire_home(context, uid, &home, &home_buffer);
4202 if (r < 0) {
4203 *exit_status = EXIT_CHDIR;
4204 return log_exec_error_errno(context, params, r, "Failed to determine $HOME for user: %m");
4205 }
4206
4207 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4208 if (socket_fd >= 0)
4209 (void) fd_nonblock(socket_fd, false);
4210
4211 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4212 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4213 if (params->cgroup_path) {
4214 _cleanup_free_ char *p = NULL;
4215
4216 r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4217 if (r < 0) {
4218 *exit_status = EXIT_CGROUP;
4219 return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4220 }
4221
4222 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4223 if (r == -EUCLEAN) {
4224 *exit_status = EXIT_CGROUP;
4225 return log_exec_error_errno(context, params, r, "Failed to attach process to cgroup %s "
4226 "because the cgroup or one of its parents or "
4227 "siblings is in the threaded mode: %m", p);
4228 }
4229 if (r < 0) {
4230 *exit_status = EXIT_CGROUP;
4231 return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", p);
4232 }
4233 }
4234
4235 if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4236 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4237 if (r < 0) {
4238 *exit_status = EXIT_NETWORK;
4239 return log_exec_error_errno(context, params, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4240 }
4241 }
4242
4243 if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4244 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4245 if (r < 0) {
4246 *exit_status = EXIT_NAMESPACE;
4247 return log_exec_error_errno(context, params, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4248 }
4249 }
4250
4251 r = setup_input(context, params, socket_fd, named_iofds);
4252 if (r < 0) {
4253 *exit_status = EXIT_STDIN;
4254 return log_exec_error_errno(context, params, r, "Failed to set up standard input: %m");
4255 }
4256
4257 r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4258 if (r < 0) {
4259 *exit_status = EXIT_STDOUT;
4260 return log_exec_error_errno(context, params, r, "Failed to set up standard output: %m");
4261 }
4262
4263 r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4264 if (r < 0) {
4265 *exit_status = EXIT_STDERR;
4266 return log_exec_error_errno(context, params, r, "Failed to set up standard error output: %m");
4267 }
4268
4269 if (context->oom_score_adjust_set) {
4270 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4271 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4272 r = set_oom_score_adjust(context->oom_score_adjust);
4273 if (ERRNO_IS_NEG_PRIVILEGE(r))
4274 log_exec_debug_errno(context, params, r,
4275 "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4276 else if (r < 0) {
4277 *exit_status = EXIT_OOM_ADJUST;
4278 return log_exec_error_errno(context, params, r, "Failed to adjust OOM setting: %m");
4279 }
4280 }
4281
4282 if (context->coredump_filter_set) {
4283 r = set_coredump_filter(context->coredump_filter);
4284 if (ERRNO_IS_NEG_PRIVILEGE(r))
4285 log_exec_debug_errno(context, params, r, "Failed to adjust coredump_filter, ignoring: %m");
4286 else if (r < 0) {
4287 *exit_status = EXIT_LIMITS;
4288 return log_exec_error_errno(context, params, r, "Failed to adjust coredump_filter: %m");
4289 }
4290 }
4291
4292 if (context->nice_set) {
4293 r = setpriority_closest(context->nice);
4294 if (r < 0) {
4295 *exit_status = EXIT_NICE;
4296 return log_exec_error_errno(context, params, r, "Failed to set up process scheduling priority (nice level): %m");
4297 }
4298 }
4299
4300 if (context->cpu_sched_set) {
4301 struct sched_param param = {
4302 .sched_priority = context->cpu_sched_priority,
4303 };
4304
4305 r = sched_setscheduler(0,
4306 context->cpu_sched_policy |
4307 (context->cpu_sched_reset_on_fork ?
4308 SCHED_RESET_ON_FORK : 0),
4309 &param);
4310 if (r < 0) {
4311 *exit_status = EXIT_SETSCHEDULER;
4312 return log_exec_error_errno(context, params, errno, "Failed to set up CPU scheduling: %m");
4313 }
4314 }
4315
4316 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4317 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4318 const CPUSet *cpu_set;
4319
4320 if (context->cpu_affinity_from_numa) {
4321 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4322 if (r < 0) {
4323 *exit_status = EXIT_CPUAFFINITY;
4324 return log_exec_error_errno(context, params, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4325 }
4326
4327 cpu_set = &converted_cpu_set;
4328 } else
4329 cpu_set = &context->cpu_set;
4330
4331 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4332 *exit_status = EXIT_CPUAFFINITY;
4333 return log_exec_error_errno(context, params, errno, "Failed to set up CPU affinity: %m");
4334 }
4335 }
4336
4337 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4338 r = apply_numa_policy(&context->numa_policy);
4339 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4340 log_exec_debug_errno(context, params, r, "NUMA support not available, ignoring.");
4341 else if (r < 0) {
4342 *exit_status = EXIT_NUMA_POLICY;
4343 return log_exec_error_errno(context, params, r, "Failed to set NUMA memory policy: %m");
4344 }
4345 }
4346
4347 if (context->ioprio_set)
4348 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4349 *exit_status = EXIT_IOPRIO;
4350 return log_exec_error_errno(context, params, errno, "Failed to set up IO scheduling priority: %m");
4351 }
4352
4353 if (context->timer_slack_nsec != NSEC_INFINITY)
4354 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4355 *exit_status = EXIT_TIMERSLACK;
4356 return log_exec_error_errno(context, params, errno, "Failed to set up timer slack: %m");
4357 }
4358
4359 if (context->personality != PERSONALITY_INVALID) {
4360 r = safe_personality(context->personality);
4361 if (r < 0) {
4362 *exit_status = EXIT_PERSONALITY;
4363 return log_exec_error_errno(context, params, r, "Failed to set up execution domain (personality): %m");
4364 }
4365 }
4366
de3612db 4367#if ENABLE_UTMP
75689fb2
LB
4368 if (context->utmp_id) {
4369 const char *line = context->tty_path ?
4370 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4371 NULL;
4372 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4373 line,
4374 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4375 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4376 USER_PROCESS,
4377 username);
4378 }
de3612db 4379#endif
75689fb2
LB
4380
4381 if (uid_is_valid(uid)) {
4382 r = chown_terminal(STDIN_FILENO, uid);
4383 if (r < 0) {
4384 *exit_status = EXIT_STDIN;
4385 return log_exec_error_errno(context, params, r, "Failed to change ownership of terminal: %m");
4386 }
4387 }
4388
4389 if (params->cgroup_path) {
4390 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4391 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4392 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4393 * touch a single hierarchy too. */
4394
4395 if (params->flags & EXEC_CGROUP_DELEGATE) {
4396 _cleanup_free_ char *p = NULL;
4397
4398 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4399 if (r < 0) {
4400 *exit_status = EXIT_CGROUP;
4401 return log_exec_error_errno(context, params, r, "Failed to adjust control group access: %m");
4402 }
4403
4404 r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4405 if (r < 0) {
4406 *exit_status = EXIT_CGROUP;
4407 return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4408 }
4409 if (r > 0) {
4410 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
4411 if (r < 0) {
4412 *exit_status = EXIT_CGROUP;
4413 return log_exec_error_errno(context, params, r, "Failed to adjust control subgroup access: %m");
4414 }
4415 }
4416 }
4417
4418 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4419 if (cgroup_context_want_memory_pressure(cgroup_context)) {
4420 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4421 if (r < 0) {
4422 *exit_status = EXIT_MEMORY;
4423 return log_oom();
4424 }
4425
4426 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4427 if (r < 0) {
4428 log_exec_full_errno(context, params, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4429 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4430 memory_pressure_path = mfree(memory_pressure_path);
4431 }
4432 } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4433 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4434 if (!memory_pressure_path) {
4435 *exit_status = EXIT_MEMORY;
4436 return log_oom();
4437 }
4438 }
4439 }
4440 }
4441
4442 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4443
4444 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4445 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4446 if (r < 0)
4447 return log_exec_error_errno(context, params, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4448 }
4449
4450 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4451 r = exec_setup_credentials(context, params, params->unit_id, uid, gid);
4452 if (r < 0) {
4453 *exit_status = EXIT_CREDENTIALS;
4454 return log_exec_error_errno(context, params, r, "Failed to set up credentials: %m");
4455 }
4456 }
4457
4458 r = build_environment(
4459 context,
4460 params,
4461 cgroup_context,
4462 n_fds,
4463 fdnames,
4464 home,
4465 username,
4466 shell,
4467 journal_stream_dev,
4468 journal_stream_ino,
4469 memory_pressure_path,
4470 &our_env);
4471 if (r < 0) {
4472 *exit_status = EXIT_MEMORY;
4473 return log_oom();
4474 }
4475
4476 r = build_pass_environment(context, &pass_env);
4477 if (r < 0) {
4478 *exit_status = EXIT_MEMORY;
4479 return log_oom();
4480 }
4481
4482 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4483 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4484 * not specify PATH but the unit has ExecSearchPath. */
4485 if (!strv_isempty(context->exec_search_path)) {
4486 _cleanup_free_ char *joined = NULL;
4487
4488 joined = strv_join(context->exec_search_path, ":");
4489 if (!joined) {
4490 *exit_status = EXIT_MEMORY;
4491 return log_oom();
4492 }
4493
4494 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4495 if (r < 0) {
4496 *exit_status = EXIT_MEMORY;
4497 return log_oom();
4498 }
4499 }
4500
4501 accum_env = strv_env_merge(params->environment,
4502 our_env,
4503 joined_exec_search_path,
4504 pass_env,
4505 context->environment,
4506 params->files_env);
4507 if (!accum_env) {
4508 *exit_status = EXIT_MEMORY;
4509 return log_oom();
4510 }
4511 accum_env = strv_env_clean(accum_env);
4512
4513 (void) umask(context->umask);
4514
4515 r = setup_keyring(context, params, uid, gid);
4516 if (r < 0) {
4517 *exit_status = EXIT_KEYRING;
4518 return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m");
4519 }
4520
4521 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4522 * from it. */
4523 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4524
4525 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4526 * for it, and the kernel doesn't actually support ambient caps. */
4527 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4528
4529 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4530 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4531 * desired. */
4532 if (needs_ambient_hack)
4533 needs_setuid = false;
4534 else
4535 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4536
4537 uint64_t capability_ambient_set = context->capability_ambient_set;
4538
4539 if (needs_sandboxing) {
4540 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4541 * /sys being present. The actual MAC context application will happen later, as late as
4542 * possible, to avoid impacting our own code paths. */
4543
4544#if HAVE_SELINUX
4545 use_selinux = mac_selinux_use();
4546#endif
4547#if ENABLE_SMACK
4548 use_smack = mac_smack_use();
4549#endif
4550#if HAVE_APPARMOR
4551 use_apparmor = mac_apparmor_use();
4552#endif
4553 }
4554
4555 if (needs_sandboxing) {
4556 int which_failed;
4557
4558 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4559 * is set here. (See below.) */
4560
4561 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4562 if (r < 0) {
4563 *exit_status = EXIT_LIMITS;
4564 return log_exec_error_errno(context, params, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4565 }
4566 }
4567
4568 if (needs_setuid && context->pam_name && username) {
6634e66d 4569 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
75689fb2
LB
4570 * wins here. (See above.) */
4571
4572 /* All fds passed in the fds array will be closed in the pam child process. */
4573 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4574 if (r < 0) {
4575 *exit_status = EXIT_PAM;
4576 return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
4577 }
4578
4579 if (ambient_capabilities_supported()) {
4580 uint64_t ambient_after_pam;
4581
4582 /* PAM modules might have set some ambient caps. Query them here and merge them into
4583 * the caps we want to set in the end, so that we don't end up unsetting them. */
4584 r = capability_get_ambient(&ambient_after_pam);
4585 if (r < 0) {
4586 *exit_status = EXIT_CAPABILITIES;
4587 return log_exec_error_errno(context, params, r, "Failed to query ambient caps: %m");
4588 }
4589
4590 capability_ambient_set |= ambient_after_pam;
4591 }
4592
4593 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4594 if (ngids_after_pam < 0) {
dbc0342e 4595 *exit_status = EXIT_GROUP;
75689fb2
LB
4596 return log_exec_error_errno(context, params, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4597 }
4598 }
4599
4600 if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
4601 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4602 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
6634e66d 4603 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
75689fb2
LB
4604
4605 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4606 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4607 * the actual requested operations fail (or silently continue). */
4608 if (r < 0 && context->private_users) {
4609 *exit_status = EXIT_USER;
4610 return log_exec_error_errno(context, params, r, "Failed to set up user namespacing for unprivileged user: %m");
4611 }
4612 if (r < 0)
4613 log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4614 else
4615 userns_set_up = true;
4616 }
4617
4618 if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4619
4620 /* Try to enable network namespacing if network namespacing is available and we have
4621 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4622 * new network namespace. And if we don't have that, then we could only create a network
4623 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4624 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
4625 r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
4626 if (ERRNO_IS_NEG_PRIVILEGE(r))
4627 log_exec_notice_errno(context, params, r,
4628 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4629 else if (r < 0) {
4630 *exit_status = EXIT_NETWORK;
4631 return log_exec_error_errno(context, params, r, "Failed to set up network namespacing: %m");
4632 }
4633 } else if (context->network_namespace_path) {
4634 *exit_status = EXIT_NETWORK;
4635 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4636 "NetworkNamespacePath= is not supported, refusing.");
4637 } else
4638 log_exec_notice(context, params, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4639 }
4640
4641 if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4642
4643 if (ns_type_supported(NAMESPACE_IPC)) {
4644 r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4645 if (r == -EPERM)
4646 log_exec_warning_errno(context, params, r,
4647 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4648 else if (r < 0) {
4649 *exit_status = EXIT_NAMESPACE;
4650 return log_exec_error_errno(context, params, r, "Failed to set up IPC namespacing: %m");
4651 }
4652 } else if (context->ipc_namespace_path) {
4653 *exit_status = EXIT_NAMESPACE;
4654 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4655 "IPCNamespacePath= is not supported, refusing.");
4656 } else
4657 log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4658 }
4659
4660 if (needs_mount_namespace) {
4661 _cleanup_free_ char *error_path = NULL;
4662
4663 r = apply_mount_namespace(command->flags, context, params, runtime, memory_pressure_path, &error_path);
4664 if (r < 0) {
4665 *exit_status = EXIT_NAMESPACE;
4666 return log_exec_error_errno(context, params, r, "Failed to set up mount namespacing%s%s: %m",
4667 error_path ? ": " : "", strempty(error_path));
4668 }
4669 }
4670
4671 if (needs_sandboxing) {
4672 r = apply_protect_hostname(context, params, exit_status);
4673 if (r < 0)
4674 return r;
4675 }
4676
4677 if (context->memory_ksm >= 0)
4678 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
4679 if (ERRNO_IS_NOT_SUPPORTED(errno))
4680 log_exec_debug_errno(context,
4681 params,
4682 errno,
4683 "KSM support not available, ignoring.");
4684 else {
4685 *exit_status = EXIT_KSM;
4686 return log_exec_error_errno(context, params, errno, "Failed to set KSM: %m");
4687 }
4688 }
4689
4690 /* Drop groups as early as possible.
6634e66d 4691 * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root.
75689fb2
LB
4692 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4693 if (needs_setuid) {
4694 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4695 int ngids_to_enforce = 0;
4696
4697 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4698 ngids,
4699 gids_after_pam,
4700 ngids_after_pam,
4701 &gids_to_enforce);
4702 if (ngids_to_enforce < 0) {
dbc0342e 4703 *exit_status = EXIT_GROUP;
75689fb2
LB
4704 return log_exec_error_errno(context, params,
4705 ngids_to_enforce,
4706 "Failed to merge group lists. Group membership might be incorrect: %m");
4707 }
4708
4709 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4710 if (r < 0) {
4711 *exit_status = EXIT_GROUP;
4712 return log_exec_error_errno(context, params, r, "Changing group credentials failed: %m");
4713 }
4714 }
4715
4716 /* If the user namespace was not set up above, try to do it now.
4717 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4718 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4719 * case of mount namespaces being less privileged when the mount point list is copied from a
4720 * different user namespace). */
4721
4722 if (needs_sandboxing && context->private_users && !userns_set_up) {
4723 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4724 if (r < 0) {
4725 *exit_status = EXIT_USER;
4726 return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m");
4727 }
4728 }
4729
4730 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4731 * shall execute. */
4732
4733 _cleanup_free_ char *executable = NULL;
4734 _cleanup_close_ int executable_fd = -EBADF;
4735 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4736 if (r < 0) {
4737 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4738 log_exec_struct_errno(context, params, LOG_INFO, r,
4739 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4740 LOG_EXEC_INVOCATION_ID(params),
4741 LOG_EXEC_MESSAGE(params,
4742 "Executable %s missing, skipping: %m",
4743 command->path),
4744 "EXECUTABLE=%s", command->path);
4745 *exit_status = EXIT_SUCCESS;
4746 return 0;
4747 }
4748
4749 *exit_status = EXIT_EXEC;
4750 return log_exec_struct_errno(context, params, LOG_INFO, r,
4751 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4752 LOG_EXEC_INVOCATION_ID(params),
4753 LOG_EXEC_MESSAGE(params,
4754 "Failed to locate executable %s: %m",
4755 command->path),
4756 "EXECUTABLE=%s", command->path);
4757 }
4758
4759 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4760 if (r < 0) {
4761 *exit_status = EXIT_FDS;
4762 return log_exec_error_errno(context, params, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4763 }
4764
4765#if HAVE_SELINUX
4766 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4767 int fd = -EBADF;
4768
4769 if (socket_fd >= 0)
4770 fd = socket_fd;
4771 else if (params->n_socket_fds == 1)
4772 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4773 * use context from that fd to compute the label. */
4774 fd = params->fds[0];
4775
4776 if (fd >= 0) {
4777 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4778 if (r < 0) {
4779 if (!context->selinux_context_ignore) {
4780 *exit_status = EXIT_SELINUX_CONTEXT;
4781 return log_exec_error_errno(context,
4782 params,
4783 r,
4784 "Failed to determine SELinux context: %m");
4785 }
4786 log_exec_debug_errno(context,
4787 params,
4788 r,
4789 "Failed to determine SELinux context, ignoring: %m");
4790 }
4791 }
4792 }
4793#endif
4794
4795 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4796 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4797 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
7b6d3dcd
LB
4798 * execve(). But first, close the remaining sockets in the context objects. */
4799
4800 exec_runtime_close(runtime);
4801 exec_params_close(params);
75689fb2
LB
4802
4803 r = close_all_fds(keep_fds, n_keep_fds);
4804 if (r >= 0)
4805 r = shift_fds(fds, n_fds);
4806 if (r >= 0)
4807 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
4808 if (r < 0) {
4809 *exit_status = EXIT_FDS;
4810 return log_exec_error_errno(context, params, r, "Failed to adjust passed file descriptors: %m");
4811 }
4812
4813 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4814 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4815 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4816 * came this far. */
4817
4818 secure_bits = context->secure_bits;
4819
4820 if (needs_sandboxing) {
4821 uint64_t bset;
4822
4823 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4824 * (Note this is placed after the general resource limit initialization, see above, in order
4825 * to take precedence.) */
4826 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4827 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4828 *exit_status = EXIT_LIMITS;
4829 return log_exec_error_errno(context, params, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4830 }
4831 }
4832
4833#if ENABLE_SMACK
4834 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4835 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4836 if (use_smack && context->smack_process_label) {
4837 r = setup_smack(params, context, executable_fd);
4838 if (r < 0 && !context->smack_process_label_ignore) {
4839 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4840 return log_exec_error_errno(context, params, r, "Failed to set SMACK process label: %m");
4841 }
4842 }
4843#endif
4844
4845 bset = context->capability_bounding_set;
4846 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4847 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4848 * instead of us doing that */
4849 if (needs_ambient_hack)
4850 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4851 (UINT64_C(1) << CAP_SETUID) |
4852 (UINT64_C(1) << CAP_SETGID);
4853
24832d10
ILG
4854#if HAVE_SECCOMP
4855 /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
4856 * keep the needed privileges to apply it even if we're not root. */
4857 if (needs_setuid &&
4858 uid_is_valid(uid) &&
4859 context_has_seccomp(context) &&
4860 seccomp_allows_drop_privileges(context)) {
4861 keep_seccomp_privileges = true;
4862
4863 if (prctl(PR_SET_KEEPCAPS, 1) < 0) {
4864 *exit_status = EXIT_USER;
4865 return log_exec_error_errno(context, params, errno, "Failed to enable keep capabilities flag: %m");
4866 }
4867
4868 /* Save the current bounding set so we can restore it after applying the seccomp
4869 * filter */
4870 saved_bset = bset;
4871 bset |= (UINT64_C(1) << CAP_SYS_ADMIN) |
4872 (UINT64_C(1) << CAP_SETPCAP);
4873 }
4874#endif
4875
75689fb2
LB
4876 if (!cap_test_all(bset)) {
4877 r = capability_bounding_set_drop(bset, /* right_now= */ false);
4878 if (r < 0) {
4879 *exit_status = EXIT_CAPABILITIES;
4880 return log_exec_error_errno(context, params, r, "Failed to drop capabilities: %m");
4881 }
4882 }
4883
4884 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4885 * keep-caps set.
4886 *
4887 * To be able to raise the ambient capabilities after setresuid() they have to be added to
4888 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
4889 * the ambient capabilities can be raised as they are present in the permitted and
4890 * inheritable set. However it is possible that someone wants to set ambient capabilities
4891 * without changing the user, so we also set the ambient capabilities here.
4892 *
4893 * The requested ambient capabilities are raised in the inheritable set if the second
4894 * argument is true. */
4895 if (!needs_ambient_hack) {
4896 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
4897 if (r < 0) {
4898 *exit_status = EXIT_CAPABILITIES;
4899 return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (before UID change): %m");
4900 }
4901 }
4902 }
4903
4904 /* chroot to root directory first, before we lose the ability to chroot */
4905 r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
4906 if (r < 0)
4907 return log_exec_error_errno(context, params, r, "Chrooting to the requested root directory failed: %m");
4908
4909 if (needs_setuid) {
4910 if (uid_is_valid(uid)) {
4911 r = enforce_user(context, uid, capability_ambient_set);
4912 if (r < 0) {
4913 *exit_status = EXIT_USER;
4914 return log_exec_error_errno(context, params, r, "Failed to change UID to " UID_FMT ": %m", uid);
4915 }
4916
24832d10
ILG
4917 if (keep_seccomp_privileges) {
4918 r = drop_capability(CAP_SETUID);
4919 if (r < 0) {
4920 *exit_status = EXIT_USER;
4921 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETUID: %m");
4922 }
4923
4924 r = keep_capability(CAP_SYS_ADMIN);
4925 if (r < 0) {
4926 *exit_status = EXIT_USER;
4927 return log_exec_error_errno(context, params, r, "Failed to keep CAP_SYS_ADMIN: %m");
4928 }
4929
4930 r = keep_capability(CAP_SETPCAP);
4931 if (r < 0) {
4932 *exit_status = EXIT_USER;
4933 return log_exec_error_errno(context, params, r, "Failed to keep CAP_SETPCAP: %m");
4934 }
4935 }
4936
75689fb2
LB
4937 if (!needs_ambient_hack && capability_ambient_set != 0) {
4938
4939 /* Raise the ambient capabilities after user change. */
4940 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
4941 if (r < 0) {
4942 *exit_status = EXIT_CAPABILITIES;
4943 return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (after UID change): %m");
4944 }
4945 }
4946 }
4947 }
4948
4949 /* Apply working directory here, because the working directory might be on NFS and only the user running
4950 * this service might have the correct privilege to change to the working directory */
4951 r = apply_working_directory(context, params, runtime, home, exit_status);
4952 if (r < 0)
4953 return log_exec_error_errno(context, params, r, "Changing to the requested working directory failed: %m");
4954
4955 if (needs_sandboxing) {
4956 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4957 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4958 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4959 * are restricted. */
4960
4961#if HAVE_SELINUX
4962 if (use_selinux) {
4963 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4964
4965 if (exec_context) {
4966 r = setexeccon(exec_context);
4967 if (r < 0) {
4968 if (!context->selinux_context_ignore) {
4969 *exit_status = EXIT_SELINUX_CONTEXT;
4970 return log_exec_error_errno(context, params, r, "Failed to change SELinux context to %s: %m", exec_context);
4971 }
4972 log_exec_debug_errno(context,
4973 params,
4974 r,
4975 "Failed to change SELinux context to %s, ignoring: %m",
4976 exec_context);
4977 }
4978 }
4979 }
4980#endif
4981
4982#if HAVE_APPARMOR
4983 if (use_apparmor && context->apparmor_profile) {
4984 r = aa_change_onexec(context->apparmor_profile);
4985 if (r < 0 && !context->apparmor_profile_ignore) {
4986 *exit_status = EXIT_APPARMOR_PROFILE;
4987 return log_exec_error_errno(context,
4988 params,
4989 errno,
4990 "Failed to prepare AppArmor profile change to %s: %m",
4991 context->apparmor_profile);
4992 }
4993 }
4994#endif
4995
4996 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
4997 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
4998 * requires CAP_SETPCAP. */
4999 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5000 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5001 * effective set here.
5002 *
5003 * The effective set is overwritten during execve() with the following values:
5004 *
5005 * - ambient set (for non-root processes)
5006 *
5007 * - (inheritable | bounding) set (for root processes)
5008 *
5009 * Hence there is no security impact to raise it in the effective set before execve
5010 */
5011 r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
5012 if (r < 0) {
5013 *exit_status = EXIT_CAPABILITIES;
5014 return log_exec_error_errno(context, params, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5015 }
5016 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5017 *exit_status = EXIT_SECUREBITS;
5018 return log_exec_error_errno(context, params, errno, "Failed to set process secure bits: %m");
5019 }
5020 }
5021
5022 if (context_has_no_new_privileges(context))
5023 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5024 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5025 return log_exec_error_errno(context, params, errno, "Failed to disable new privileges: %m");
5026 }
5027
5028#if HAVE_SECCOMP
5029 r = apply_address_families(context, params);
5030 if (r < 0) {
5031 *exit_status = EXIT_ADDRESS_FAMILIES;
5032 return log_exec_error_errno(context, params, r, "Failed to restrict address families: %m");
5033 }
5034
5035 r = apply_memory_deny_write_execute(context, params);
5036 if (r < 0) {
5037 *exit_status = EXIT_SECCOMP;
5038 return log_exec_error_errno(context, params, r, "Failed to disable writing to executable memory: %m");
5039 }
5040
5041 r = apply_restrict_realtime(context, params);
5042 if (r < 0) {
5043 *exit_status = EXIT_SECCOMP;
5044 return log_exec_error_errno(context, params, r, "Failed to apply realtime restrictions: %m");
5045 }
5046
5047 r = apply_restrict_suid_sgid(context, params);
5048 if (r < 0) {
5049 *exit_status = EXIT_SECCOMP;
5050 return log_exec_error_errno(context, params, r, "Failed to apply SUID/SGID restrictions: %m");
5051 }
5052
5053 r = apply_restrict_namespaces(context, params);
5054 if (r < 0) {
5055 *exit_status = EXIT_SECCOMP;
5056 return log_exec_error_errno(context, params, r, "Failed to apply namespace restrictions: %m");
5057 }
5058
5059 r = apply_protect_sysctl(context, params);
5060 if (r < 0) {
5061 *exit_status = EXIT_SECCOMP;
5062 return log_exec_error_errno(context, params, r, "Failed to apply sysctl restrictions: %m");
5063 }
5064
5065 r = apply_protect_kernel_modules(context, params);
5066 if (r < 0) {
5067 *exit_status = EXIT_SECCOMP;
5068 return log_exec_error_errno(context, params, r, "Failed to apply module loading restrictions: %m");
5069 }
5070
5071 r = apply_protect_kernel_logs(context, params);
5072 if (r < 0) {
5073 *exit_status = EXIT_SECCOMP;
5074 return log_exec_error_errno(context, params, r, "Failed to apply kernel log restrictions: %m");
5075 }
5076
5077 r = apply_protect_clock(context, params);
5078 if (r < 0) {
5079 *exit_status = EXIT_SECCOMP;
5080 return log_exec_error_errno(context, params, r, "Failed to apply clock restrictions: %m");
5081 }
5082
5083 r = apply_private_devices(context, params);
5084 if (r < 0) {
5085 *exit_status = EXIT_SECCOMP;
5086 return log_exec_error_errno(context, params, r, "Failed to set up private devices: %m");
5087 }
5088
5089 r = apply_syscall_archs(context, params);
5090 if (r < 0) {
5091 *exit_status = EXIT_SECCOMP;
5092 return log_exec_error_errno(context, params, r, "Failed to apply syscall architecture restrictions: %m");
5093 }
5094
5095 r = apply_lock_personality(context, params);
5096 if (r < 0) {
5097 *exit_status = EXIT_SECCOMP;
5098 return log_exec_error_errno(context, params, r, "Failed to lock personalities: %m");
5099 }
5100
5101 r = apply_syscall_log(context, params);
5102 if (r < 0) {
5103 *exit_status = EXIT_SECCOMP;
5104 return log_exec_error_errno(context, params, r, "Failed to apply system call log filters: %m");
5105 }
24832d10 5106#endif
75689fb2 5107
24832d10
ILG
5108#if HAVE_LIBBPF
5109 r = apply_restrict_filesystems(context, params);
5110 if (r < 0) {
5111 *exit_status = EXIT_BPF;
5112 return log_exec_error_errno(context, params, r, "Failed to restrict filesystems: %m");
5113 }
5114#endif
5115
5116#if HAVE_SECCOMP
5117 /* This really should remain as close to the execve() as possible, to make sure our own code is affected
75689fb2
LB
5118 * by the filter as little as possible. */
5119 r = apply_syscall_filter(context, params, needs_ambient_hack);
5120 if (r < 0) {
5121 *exit_status = EXIT_SECCOMP;
5122 return log_exec_error_errno(context, params, r, "Failed to apply system call filters: %m");
5123 }
75689fb2 5124
24832d10
ILG
5125 if (keep_seccomp_privileges) {
5126 /* Restore the capability bounding set with what's expected from the service + the
5127 * ambient capabilities hack */
5128 if (!cap_test_all(saved_bset)) {
5129 r = capability_bounding_set_drop(saved_bset, /* right_now= */ false);
5130 if (r < 0) {
5131 *exit_status = EXIT_CAPABILITIES;
5132 return log_exec_error_errno(context, params, r, "Failed to drop bset capabilities: %m");
5133 }
5134 }
5135
5136 /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break
5137 * applications that use it. */
5138 if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SYS_ADMIN))) {
5139 r = drop_capability(CAP_SYS_ADMIN);
5140 if (r < 0) {
5141 *exit_status = EXIT_USER;
5142 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SYS_ADMIN: %m");
5143 }
5144 }
5145
5146 /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break
5147 * applications that use it. */
5148 if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SETPCAP))) {
5149 r = drop_capability(CAP_SETPCAP);
5150 if (r < 0) {
5151 *exit_status = EXIT_USER;
5152 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETPCAP: %m");
5153 }
5154 }
5155
5156 if (prctl(PR_SET_KEEPCAPS, 0) < 0) {
5157 *exit_status = EXIT_USER;
5158 return log_exec_error_errno(context, params, errno, "Failed to drop keep capabilities flag: %m");
5159 }
75689fb2
LB
5160 }
5161#endif
5162
5163 }
5164
5165 if (!strv_isempty(context->unset_environment)) {
5166 char **ee = NULL;
5167
5168 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5169 if (!ee) {
5170 *exit_status = EXIT_MEMORY;
5171 return log_oom();
5172 }
5173
5174 strv_free_and_replace(accum_env, ee);
5175 }
5176
5177 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5178 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5179
5180 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5181 if (r < 0) {
5182 *exit_status = EXIT_MEMORY;
5183 return log_exec_error_errno(context,
5184 params,
5185 r,
5186 "Failed to replace environment variables: %m");
5187 }
5188 final_argv = replaced_argv;
5189
5190 if (!strv_isempty(unset_variables)) {
5191 _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5192 log_exec_warning(context,
5193 params,
5194 "Referenced but unset environment variable evaluates to an empty string: %s",
5195 strna(ju));
5196 }
5197
5198 if (!strv_isempty(bad_variables)) {
5199 _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5200 log_exec_warning(context,
5201 params,
5202 "Invalid environment variable name evaluates to an empty string: %s",
5203 strna(jb));
5204 }
5205 } else
5206 final_argv = command->argv;
5207
5208 log_command_line(context, params, "Executing", executable, final_argv);
5209
5210 if (exec_fd >= 0) {
5211 uint8_t hot = 1;
5212
5213 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5214 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5215
5216 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5217 *exit_status = EXIT_EXEC;
5218 return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m");
5219 }
5220 }
5221
5222 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5223
5224 if (exec_fd >= 0) {
5225 uint8_t hot = 0;
5226
5227 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5228 * that POLLHUP on it no longer means execve() succeeded. */
5229
5230 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5231 *exit_status = EXIT_EXEC;
5232 return log_exec_error_errno(context, params, errno, "Failed to disable exec_fd: %m");
5233 }
5234 }
5235
5236 *exit_status = EXIT_EXEC;
5237 return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable);
5238}