]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/exec-invoke.c
Merge pull request #30284 from YHNdnzj/fstab-wantedby-defaultdeps
[thirdparty/systemd.git] / src / core / exec-invoke.c
CommitLineData
75689fb2
LB
1/* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3#include <sys/eventfd.h>
4#include <sys/ioctl.h>
5#include <sys/mount.h>
6#include <sys/prctl.h>
7
8#if HAVE_PAM
9#include <security/pam_appl.h>
10#include <security/pam_misc.h>
11#endif
12
13#if HAVE_APPARMOR
14#include <sys/apparmor.h>
15#endif
16
17#include "sd-messages.h"
18
19#if HAVE_APPARMOR
20#include "apparmor-util.h"
21#endif
22#include "argv-util.h"
23#include "barrier.h"
24#include "bpf-dlopen.h"
25#include "bpf-lsm.h"
26#include "btrfs-util.h"
27#include "capability-util.h"
28#include "cgroup-setup.h"
29#include "chase.h"
30#include "chattr-util.h"
31#include "chown-recursive.h"
32#include "copy.h"
33#include "data-fd-util.h"
34#include "env-util.h"
35#include "escape.h"
36#include "exec-credential.h"
37#include "exec-invoke.h"
38#include "execute.h"
39#include "exit-status.h"
40#include "fd-util.h"
41#include "hexdecoct.h"
42#include "io-util.h"
bd1ae178 43#include "iovec-util.h"
75689fb2
LB
44#include "missing_ioprio.h"
45#include "missing_prctl.h"
46#include "missing_securebits.h"
47#include "missing_syscall.h"
48#include "mkdir-label.h"
49#include "proc-cmdline.h"
50#include "process-util.h"
51#include "psi-util.h"
52#include "rlimit-util.h"
53#include "seccomp-util.h"
54#include "selinux-util.h"
55#include "signal-util.h"
56#include "smack-util.h"
57#include "socket-util.h"
58#include "string-table.h"
59#include "strv.h"
60#include "terminal-util.h"
61#include "utmp-wtmp.h"
62
63#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
64#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
65
66#define SNDBUF_SIZE (8*1024*1024)
67
/* Moves the passed file descriptors so that fds[i] ends up as fd number i+3. The array is updated in
 * place with the new fd numbers. Returns 0 on success, or a negative errno if duplicating an fd fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0, count = (int) n_fds;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int retry_at = -1;

                for (int idx = first; idx < count; idx++) {
                        int wanted = idx + 3, got;

                        /* Nothing to do if the fd already sits in its slot */
                        if (fds[idx] == wanted)
                                continue;

                        got = fcntl(fds[idx], F_DUPFD, wanted);
                        if (got < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = got;

                        /* The slot we were aiming for was still occupied, so we got a higher number.
                         * Remember the first index where that happened and take another pass from there. */
                        if (got != wanted && retry_at < 0)
                                retry_at = idx;
                }

                first = retry_at;
        } while (first >= 0);

        return 0;
}
107
/* Adjusts the file flags of the fds we are going to pass on: the first n_socket_fds entries get
 * O_NONBLOCK set or cleared according to 'nonblock', and FD_CLOEXEC is cleared on every entry.
 * Returns 0 on success or the first negative error returned by a helper. */
static int flag_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        assert(fds || n_fds == 0);

        for (size_t k = 0; k < n_fds; k++) {
                int r;

                /* O_NONBLOCK is only touched for the socket activation fds */
                if (k < n_socket_fds) {
                        r = fd_nonblock(fds[k], nonblock);
                        if (r < 0)
                                return r;
                }

                /* These fds are meant to survive into the child, hence always drop FD_CLOEXEC */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
140
141static bool is_terminal_input(ExecInput i) {
142 return IN_SET(i,
143 EXEC_INPUT_TTY,
144 EXEC_INPUT_TTY_FORCE,
145 EXEC_INPUT_TTY_FAIL);
146}
147
148static bool is_terminal_output(ExecOutput o) {
149 return IN_SET(o,
150 EXEC_OUTPUT_TTY,
151 EXEC_OUTPUT_KMSG_AND_CONSOLE,
152 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
153}
154
155static bool is_kmsg_output(ExecOutput o) {
156 return IN_SET(o,
157 EXEC_OUTPUT_KMSG,
158 EXEC_OUTPUT_KMSG_AND_CONSOLE);
159}
160
161static bool exec_context_needs_term(const ExecContext *c) {
162 assert(c);
163
164 /* Return true if the execution context suggests we should set $TERM to something useful. */
165
166 if (is_terminal_input(c->std_input))
167 return true;
168
169 if (is_terminal_output(c->std_output))
170 return true;
171
172 if (is_terminal_output(c->std_error))
173 return true;
174
175 return !!c->tty_path;
176}
177
178static int open_null_as(int flags, int nfd) {
179 int fd;
180
181 assert(nfd >= 0);
182
183 fd = open("/dev/null", flags|O_NOCTTY);
184 if (fd < 0)
185 return -errno;
186
187 return move_fd(fd, nfd, false);
188}
189
190static int connect_journal_socket(
191 int fd,
192 const char *log_namespace,
193 uid_t uid,
194 gid_t gid) {
195
196 uid_t olduid = UID_INVALID;
197 gid_t oldgid = GID_INVALID;
198 const char *j;
199 int r;
200
201 j = log_namespace ?
202 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
203 "/run/systemd/journal/stdout";
204
205 if (gid_is_valid(gid)) {
206 oldgid = getgid();
207
208 if (setegid(gid) < 0)
209 return -errno;
210 }
211
212 if (uid_is_valid(uid)) {
213 olduid = getuid();
214
215 if (seteuid(uid) < 0) {
216 r = -errno;
217 goto restore_gid;
218 }
219 }
220
221 r = connect_unix_path(fd, AT_FDCWD, j);
222
223 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
224 an LSM interferes. */
225
226 if (uid_is_valid(uid))
227 (void) seteuid(olduid);
228
229 restore_gid:
230 if (gid_is_valid(gid))
231 (void) setegid(oldgid);
232
233 return r;
234}
235
236static int connect_logger_as(
237 const ExecContext *context,
238 const ExecParameters *params,
239 ExecOutput output,
240 const char *ident,
241 int nfd,
242 uid_t uid,
243 gid_t gid) {
244
245 _cleanup_close_ int fd = -EBADF;
246 int r;
247
248 assert(context);
249 assert(params);
250 assert(output < _EXEC_OUTPUT_MAX);
251 assert(ident);
252 assert(nfd >= 0);
253
254 fd = socket(AF_UNIX, SOCK_STREAM, 0);
255 if (fd < 0)
256 return -errno;
257
258 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
259 if (r < 0)
260 return r;
261
262 if (shutdown(fd, SHUT_RD) < 0)
263 return -errno;
264
265 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
266
267 if (dprintf(fd,
268 "%s\n"
269 "%s\n"
270 "%i\n"
271 "%i\n"
272 "%i\n"
273 "%i\n"
274 "%i\n",
275 context->syslog_identifier ?: ident,
276 params->flags & EXEC_PASS_LOG_UNIT ? params->unit_id : "",
277 context->syslog_priority,
278 !!context->syslog_level_prefix,
279 false,
280 is_kmsg_output(output),
281 is_terminal_output(output)) < 0)
282 return -errno;
283
284 return move_fd(TAKE_FD(fd), nfd, false);
285}
286
287static int open_terminal_as(const char *path, int flags, int nfd) {
288 int fd;
289
290 assert(path);
291 assert(nfd >= 0);
292
293 fd = open_terminal(path, flags | O_NOCTTY);
294 if (fd < 0)
295 return fd;
296
297 return move_fd(fd, nfd, false);
298}
299
300static int acquire_path(const char *path, int flags, mode_t mode) {
301 _cleanup_close_ int fd = -EBADF;
302 int r;
303
304 assert(path);
305
306 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
307 flags |= O_CREAT;
308
309 fd = open(path, flags|O_NOCTTY, mode);
310 if (fd >= 0)
311 return TAKE_FD(fd);
312
313 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
314 return -errno;
315
316 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
317
318 fd = socket(AF_UNIX, SOCK_STREAM, 0);
319 if (fd < 0)
320 return -errno;
321
322 r = connect_unix_path(fd, AT_FDCWD, path);
323 if (IN_SET(r, -ENOTSOCK, -EINVAL))
324 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
325 * wasn't an AF_UNIX socket after all */
326 return -ENXIO;
327 if (r < 0)
328 return r;
329
330 if ((flags & O_ACCMODE) == O_RDONLY)
331 r = shutdown(fd, SHUT_WR);
332 else if ((flags & O_ACCMODE) == O_WRONLY)
333 r = shutdown(fd, SHUT_RD);
334 else
335 r = 0;
336 if (r < 0)
337 return -errno;
338
339 return TAKE_FD(fd);
340}
341
342static int fixup_input(
343 const ExecContext *context,
344 int socket_fd,
345 bool apply_tty_stdin) {
346
347 ExecInput std_input;
348
349 assert(context);
350
351 std_input = context->std_input;
352
353 if (is_terminal_input(std_input) && !apply_tty_stdin)
354 return EXEC_INPUT_NULL;
355
356 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
357 return EXEC_INPUT_NULL;
358
359 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
360 return EXEC_INPUT_NULL;
361
362 return std_input;
363}
364
365static int fixup_output(ExecOutput output, int socket_fd) {
366
367 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
368 return EXEC_OUTPUT_INHERIT;
369
370 return output;
371}
372
373static int setup_input(
374 const ExecContext *context,
375 const ExecParameters *params,
376 int socket_fd,
377 const int named_iofds[static 3]) {
378
379 ExecInput i;
380 int r;
381
382 assert(context);
383 assert(params);
384 assert(named_iofds);
385
386 if (params->stdin_fd >= 0) {
387 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
388 return -errno;
389
390 /* Try to make this the controlling tty, if it is a tty, and reset it */
391 if (isatty(STDIN_FILENO)) {
75689fb2 392 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
bb2dbe7c
LP
393
394 if (context->tty_reset)
395 (void) reset_terminal_fd(STDIN_FILENO, /* switch_to_text= */ true);
396
d2b9e755 397 (void) exec_context_apply_tty_size(context, STDIN_FILENO, /* tty_path= */ NULL);
75689fb2
LB
398 }
399
400 return STDIN_FILENO;
401 }
402
403 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
404
405 switch (i) {
406
407 case EXEC_INPUT_NULL:
408 return open_null_as(O_RDONLY, STDIN_FILENO);
409
410 case EXEC_INPUT_TTY:
411 case EXEC_INPUT_TTY_FORCE:
412 case EXEC_INPUT_TTY_FAIL: {
d2b9e755
LP
413 _cleanup_close_ int tty_fd = -EBADF;
414 const char *tty_path;
75689fb2 415
d2b9e755
LP
416 tty_path = ASSERT_PTR(exec_context_tty_path(context));
417
418 tty_fd = acquire_terminal(tty_path,
419 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
420 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
421 ACQUIRE_TERMINAL_WAIT,
422 USEC_INFINITY);
423 if (tty_fd < 0)
424 return tty_fd;
75689fb2 425
d2b9e755 426 r = exec_context_apply_tty_size(context, tty_fd, tty_path);
75689fb2
LB
427 if (r < 0)
428 return r;
429
d2b9e755 430 r = move_fd(tty_fd, STDIN_FILENO, /* cloexec= */ false);
75689fb2
LB
431 if (r < 0)
432 return r;
433
d2b9e755
LP
434 TAKE_FD(tty_fd);
435 return r;
75689fb2
LB
436 }
437
438 case EXEC_INPUT_SOCKET:
439 assert(socket_fd >= 0);
440
441 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
442
443 case EXEC_INPUT_NAMED_FD:
444 assert(named_iofds[STDIN_FILENO] >= 0);
445
446 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
447 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
448
449 case EXEC_INPUT_DATA: {
450 int fd;
451
452 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
453 if (fd < 0)
454 return fd;
455
456 return move_fd(fd, STDIN_FILENO, false);
457 }
458
459 case EXEC_INPUT_FILE: {
460 bool rw;
461 int fd;
462
463 assert(context->stdio_file[STDIN_FILENO]);
464
465 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
466 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
467
468 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
469 if (fd < 0)
470 return fd;
471
472 return move_fd(fd, STDIN_FILENO, false);
473 }
474
475 default:
476 assert_not_reached();
477 }
478}
479
480static bool can_inherit_stderr_from_stdout(
481 const ExecContext *context,
482 ExecOutput o,
483 ExecOutput e) {
484
485 assert(context);
486
487 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
488 * stderr fd */
489
490 if (e == EXEC_OUTPUT_INHERIT)
491 return true;
492 if (e != o)
493 return false;
494
495 if (e == EXEC_OUTPUT_NAMED_FD)
496 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
497
498 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
499 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
500
501 return true;
502}
503
504static int setup_output(
505 const ExecContext *context,
506 const ExecParameters *params,
507 int fileno,
508 int socket_fd,
509 const int named_iofds[static 3],
510 const char *ident,
511 uid_t uid,
512 gid_t gid,
513 dev_t *journal_stream_dev,
514 ino_t *journal_stream_ino) {
515
516 ExecOutput o;
517 ExecInput i;
518 int r;
519
520 assert(context);
521 assert(params);
522 assert(ident);
523 assert(journal_stream_dev);
524 assert(journal_stream_ino);
525
526 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
527
528 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
529 return -errno;
530
531 return STDOUT_FILENO;
532 }
533
534 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
535 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
536 return -errno;
537
538 return STDERR_FILENO;
539 }
540
541 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
542 o = fixup_output(context->std_output, socket_fd);
543
544 if (fileno == STDERR_FILENO) {
545 ExecOutput e;
546 e = fixup_output(context->std_error, socket_fd);
547
548 /* This expects the input and output are already set up */
549
550 /* Don't change the stderr file descriptor if we inherit all
551 * the way and are not on a tty */
552 if (e == EXEC_OUTPUT_INHERIT &&
553 o == EXEC_OUTPUT_INHERIT &&
554 i == EXEC_INPUT_NULL &&
555 !is_terminal_input(context->std_input) &&
556 getppid() != 1)
557 return fileno;
558
559 /* Duplicate from stdout if possible */
560 if (can_inherit_stderr_from_stdout(context, o, e))
561 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
562
563 o = e;
564
565 } else if (o == EXEC_OUTPUT_INHERIT) {
566 /* If input got downgraded, inherit the original value */
567 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
568 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
569
570 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
571 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
572 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
573
574 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
575 if (getppid() != 1)
576 return fileno;
577
578 /* We need to open /dev/null here anew, to get the right access mode. */
579 return open_null_as(O_WRONLY, fileno);
580 }
581
582 switch (o) {
583
584 case EXEC_OUTPUT_NULL:
585 return open_null_as(O_WRONLY, fileno);
586
587 case EXEC_OUTPUT_TTY:
588 if (is_terminal_input(i))
589 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
590
591 /* We don't reset the terminal if this is just about output */
592 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
593
594 case EXEC_OUTPUT_KMSG:
595 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
596 case EXEC_OUTPUT_JOURNAL:
597 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
598 r = connect_logger_as(context, params, o, ident, fileno, uid, gid);
599 if (r < 0) {
600 log_exec_warning_errno(context,
601 params,
602 r,
603 "Failed to connect %s to the journal socket, ignoring: %m",
604 fileno == STDOUT_FILENO ? "stdout" : "stderr");
605 r = open_null_as(O_WRONLY, fileno);
606 } else {
607 struct stat st;
608
609 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
610 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
611 * services to detect whether they are connected to the journal or not.
612 *
613 * If both stdout and stderr are connected to a stream then let's make sure to store the data
614 * about STDERR as that's usually the best way to do logging. */
615
616 if (fstat(fileno, &st) >= 0 &&
617 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
618 *journal_stream_dev = st.st_dev;
619 *journal_stream_ino = st.st_ino;
620 }
621 }
622 return r;
623
624 case EXEC_OUTPUT_SOCKET:
625 assert(socket_fd >= 0);
626
627 return RET_NERRNO(dup2(socket_fd, fileno));
628
629 case EXEC_OUTPUT_NAMED_FD:
630 assert(named_iofds[fileno] >= 0);
631
632 (void) fd_nonblock(named_iofds[fileno], false);
633 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
634
635 case EXEC_OUTPUT_FILE:
636 case EXEC_OUTPUT_FILE_APPEND:
637 case EXEC_OUTPUT_FILE_TRUNCATE: {
638 bool rw;
639 int fd, flags;
640
641 assert(context->stdio_file[fileno]);
642
643 rw = context->std_input == EXEC_INPUT_FILE &&
644 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
645
646 if (rw)
647 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
648
649 flags = O_WRONLY;
650 if (o == EXEC_OUTPUT_FILE_APPEND)
651 flags |= O_APPEND;
652 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
653 flags |= O_TRUNC;
654
655 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
656 if (fd < 0)
657 return fd;
658
659 return move_fd(fd, fileno, 0);
660 }
661
662 default:
663 assert_not_reached();
664 }
665}
666
667static int chown_terminal(int fd, uid_t uid) {
668 int r;
669
670 assert(fd >= 0);
671
672 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
673 if (isatty(fd) < 1) {
674 if (IN_SET(errno, EINVAL, ENOTTY))
675 return 0; /* not a tty */
676
677 return -errno;
678 }
679
680 /* This might fail. What matters are the results. */
681 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
682 if (r < 0)
683 return r;
684
685 return 1;
686}
687
688static int setup_confirm_stdio(
689 const ExecContext *context,
690 const char *vc,
691 int *ret_saved_stdin,
692 int *ret_saved_stdout) {
693
694 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
75689fb2
LB
695 int r;
696
697 assert(ret_saved_stdin);
698 assert(ret_saved_stdout);
699
700 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
701 if (saved_stdin < 0)
702 return -errno;
703
704 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
705 if (saved_stdout < 0)
706 return -errno;
707
708 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
709 if (fd < 0)
710 return fd;
711
712 r = chown_terminal(fd, getuid());
713 if (r < 0)
714 return r;
715
29ed1f02 716 r = reset_terminal_fd(fd, /* switch_to_text= */ true);
75689fb2
LB
717 if (r < 0)
718 return r;
719
d2b9e755 720 r = exec_context_apply_tty_size(context, fd, vc);
75689fb2
LB
721 if (r < 0)
722 return r;
723
724 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
725 TAKE_FD(fd);
726 if (r < 0)
727 return r;
728
729 *ret_saved_stdin = TAKE_FD(saved_stdin);
730 *ret_saved_stdout = TAKE_FD(saved_stdout);
731 return 0;
732}
733
/* Writes a message to 'fd' explaining why the confirmation question for 'unit_id' could not be asked
 * (or timed out), and that a positive response is assumed. 'err' is a negative errno-style error. */
static void write_confirm_error_fd(int err, int fd, const char *unit_id) {
        assert(err < 0);
        assert(unit_id);

        if (err != -ETIMEDOUT) {
                /* %m picks the error up from errno */
                errno = -err;
                dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", unit_id);
                return;
        }

        dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", unit_id);
}
745
746static void write_confirm_error(int err, const char *vc, const char *unit_id) {
747 _cleanup_close_ int fd = -EBADF;
748
749 assert(vc);
750
751 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
752 if (fd < 0)
753 return;
754
755 write_confirm_error_fd(err, fd, unit_id);
756}
757
/* Releases the terminal and puts the saved stdin/stdout duplicates back in place, then closes the
 * duplicates. Returns 0 on success, or the negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
779
/* Possible outcomes of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* execute the command */
};
785
/* Returns true if the flag file /run/systemd/confirm_spawn_disabled exists. */
static bool confirm_spawn_disabled(void) {
        if (access("/run/systemd/confirm_spawn_disabled", F_OK) < 0)
                return false;

        return true;
}
789
790static int ask_for_confirmation(const ExecContext *context, const ExecParameters *params, const char *cmdline) {
791 int saved_stdout = -1, saved_stdin = -1, r;
792 _cleanup_free_ char *e = NULL;
793 char c;
794
795 assert(context);
796 assert(params);
797
798 /* For any internal errors, assume a positive response. */
799 r = setup_confirm_stdio(context, params->confirm_spawn, &saved_stdin, &saved_stdout);
800 if (r < 0) {
801 write_confirm_error(r, params->confirm_spawn, params->unit_id);
802 return CONFIRM_EXECUTE;
803 }
804
805 /* confirm_spawn might have been disabled while we were sleeping. */
806 if (!params->confirm_spawn || confirm_spawn_disabled()) {
807 r = 1;
808 goto restore_stdio;
809 }
810
811 e = ellipsize(cmdline, 60, 100);
812 if (!e) {
813 log_oom();
814 r = CONFIRM_EXECUTE;
815 goto restore_stdio;
816 }
817
818 for (;;) {
819 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
820 if (r < 0) {
821 write_confirm_error_fd(r, STDOUT_FILENO, params->unit_id);
822 r = CONFIRM_EXECUTE;
823 goto restore_stdio;
824 }
825
826 switch (c) {
827 case 'c':
828 printf("Resuming normal execution.\n");
829 manager_disable_confirm_spawn();
830 r = 1;
831 break;
832 case 'D':
833 printf(" Unit: %s\n",
834 params->unit_id);
835 exec_context_dump(context, stdout, " ");
836 exec_params_dump(params, stdout, " ");
837 continue; /* ask again */
838 case 'f':
839 printf("Failing execution.\n");
840 r = CONFIRM_PRETEND_FAILURE;
841 break;
842 case 'h':
843 printf(" c - continue, proceed without asking anymore\n"
844 " D - dump, show the state of the unit\n"
845 " f - fail, don't execute the command and pretend it failed\n"
846 " h - help\n"
847 " i - info, show a short summary of the unit\n"
848 " j - jobs, show jobs that are in progress\n"
849 " s - skip, don't execute the command and pretend it succeeded\n"
850 " y - yes, execute the command\n");
851 continue; /* ask again */
852 case 'i':
853 printf(" Unit: %s\n"
854 " Command: %s\n",
855 params->unit_id, cmdline);
856 continue; /* ask again */
857 case 'j':
858 if (sigqueue(getppid(),
859 SIGRTMIN+18,
860 (const union sigval) { .sival_int = MANAGER_SIGNAL_COMMAND_DUMP_JOBS }) < 0)
861 return -errno;
862
863 continue; /* ask again */
864 case 'n':
865 /* 'n' was removed in favor of 'f'. */
866 printf("Didn't understand 'n', did you mean 'f'?\n");
867 continue; /* ask again */
868 case 's':
869 printf("Skipping execution.\n");
870 r = CONFIRM_PRETEND_SUCCESS;
871 break;
872 case 'y':
873 r = CONFIRM_EXECUTE;
874 break;
875 default:
876 assert_not_reached();
877 }
878 break;
879 }
880
881restore_stdio:
882 restore_confirm_stdio(&saved_stdin, &saved_stdout);
883 return r;
884}
885
886static int get_fixed_user(
8d85efae
MY
887 const char *user_or_uid,
888 const char **ret_username,
75689fb2
LB
889 uid_t *ret_uid,
890 gid_t *ret_gid,
891 const char **ret_home,
892 const char **ret_shell) {
893
894 int r;
895
8d85efae
MY
896 assert(user_or_uid);
897 assert(ret_username);
75689fb2
LB
898
899 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
900 * (i.e. are "/" or "/bin/nologin"). */
901
8d85efae 902 r = get_user_creds(&user_or_uid, ret_uid, ret_gid, ret_home, ret_shell, USER_CREDS_CLEAN);
75689fb2
LB
903 if (r < 0)
904 return r;
905
8d85efae
MY
906 /* user_or_uid is normalized by get_user_creds to username */
907 *ret_username = user_or_uid;
908
75689fb2
LB
909 return 0;
910}
911
/* Resolves a group name or numeric GID via get_group_creds(). On success returns 0, stores the group
 * name as normalized by get_group_creds() in *ret_groupname and lets the lookup fill in *ret_gid. On
 * failure returns the negative error of the lookup. */
static int get_fixed_group(
                const char *group_or_gid,
                const char **ret_groupname,
                gid_t *ret_gid) {

        const char *name = group_or_gid;
        int r;

        assert(group_or_gid);
        assert(ret_groupname);

        r = get_group_creds(&name, ret_gid, /* flags = */ 0);
        if (r < 0)
                return r;

        /* get_group_creds() has normalized 'name' to the group name by now */
        *ret_groupname = name;
        return 0;
}
931
932static int get_supplementary_groups(const ExecContext *c, const char *user,
933 const char *group, gid_t gid,
934 gid_t **supplementary_gids, int *ngids) {
935 int r, k = 0;
936 int ngroups_max;
937 bool keep_groups = false;
938 gid_t *groups = NULL;
939 _cleanup_free_ gid_t *l_gids = NULL;
940
941 assert(c);
942
943 /*
944 * If user is given, then lookup GID and supplementary groups list.
945 * We avoid NSS lookups for gid=0. Also we have to initialize groups
946 * here and as early as possible so we keep the list of supplementary
947 * groups of the caller.
948 */
949 if (user && gid_is_valid(gid) && gid != 0) {
950 /* First step, initialize groups from /etc/groups */
951 if (initgroups(user, gid) < 0)
952 return -errno;
953
954 keep_groups = true;
955 }
956
957 if (strv_isempty(c->supplementary_groups))
958 return 0;
959
960 /*
961 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
962 * be positive, otherwise fail.
963 */
964 errno = 0;
965 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
966 if (ngroups_max <= 0)
967 return errno_or_else(EOPNOTSUPP);
968
969 l_gids = new(gid_t, ngroups_max);
970 if (!l_gids)
971 return -ENOMEM;
972
973 if (keep_groups) {
974 /*
975 * Lookup the list of groups that the user belongs to, we
976 * avoid NSS lookups here too for gid=0.
977 */
978 k = ngroups_max;
979 if (getgrouplist(user, gid, l_gids, &k) < 0)
980 return -EINVAL;
981 } else
982 k = 0;
983
984 STRV_FOREACH(i, c->supplementary_groups) {
985 const char *g;
986
987 if (k >= ngroups_max)
988 return -E2BIG;
989
990 g = *i;
991 r = get_group_creds(&g, l_gids+k, 0);
992 if (r < 0)
993 return r;
994
995 k++;
996 }
997
998 /*
999 * Sets ngids to zero to drop all supplementary groups, happens
1000 * when we are under root and SupplementaryGroups= is empty.
1001 */
1002 if (k == 0) {
1003 *ngids = 0;
1004 return 0;
1005 }
1006
1007 /* Otherwise get the final list of supplementary groups */
1008 groups = memdup(l_gids, sizeof(gid_t) * k);
1009 if (!groups)
1010 return -ENOMEM;
1011
1012 *supplementary_gids = groups;
1013 *ngids = k;
1014
1015 groups = NULL;
1016
1017 return 0;
1018}
1019
/* Applies the supplementary groups (if there are any) and then, if 'gid' is valid, sets the real,
 * effective and saved GID to it. Returns 0 on success, a negative errno-style error on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        int r;

        /* Only bother with SupplementaryGroups= if the list is non-empty */
        if (ngids > 0) {
                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1038
/* Updates the process' securebits: clears the bits in 'mask', then sets the bits in 'bits'. Returns 1
 * if the securebits were changed, 0 if they already had the requested value, or a negative errno if
 * querying or setting them fails. */
static int set_securebits(unsigned bits, unsigned mask) {
        unsigned before, wanted;
        int current;

        current = prctl(PR_GET_SECUREBITS);
        if (current < 0)
                return -errno;

        before = (unsigned) current;
        wanted = (before & ~mask) | bits;

        /* Nothing to do if we are already there */
        if (wanted == before)
                return 0;

        return prctl(PR_SET_SECUREBITS, wanted) < 0 ? -errno : 1;
}
1057
1058static int enforce_user(
1059 const ExecContext *context,
1060 uid_t uid,
1061 uint64_t capability_ambient_set) {
1062 assert(context);
1063 int r;
1064
1065 if (!uid_is_valid(uid))
1066 return 0;
1067
1068 /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1069 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1070 * case. */
1071
1072 if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1073
1074 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1075 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1076 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1077 if (r < 0)
1078 return r;
1079 }
1080
1081 /* Second step: actually set the uids */
1082 if (setresuid(uid, uid, uid) < 0)
1083 return -errno;
1084
1085 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1086 * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1087 * outside of this call. */
1088 return 0;
1089}
1090
1091#if HAVE_PAM
1092
1093static int null_conv(
1094 int num_msg,
1095 const struct pam_message **msg,
1096 struct pam_response **resp,
1097 void *appdata_ptr) {
1098
1099 /* We don't support conversations */
1100
1101 return PAM_CONV_ERR;
1102}
1103
1104#endif
1105
1106static int setup_pam(
1107 const char *name,
1108 const char *user,
1109 uid_t uid,
1110 gid_t gid,
1111 const char *tty,
1112 char ***env, /* updated on success */
1113 const int fds[], size_t n_fds) {
1114
1115#if HAVE_PAM
1116
1117 static const struct pam_conv conv = {
1118 .conv = null_conv,
1119 .appdata_ptr = NULL
1120 };
1121
1122 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1123 _cleanup_strv_free_ char **e = NULL;
1124 pam_handle_t *handle = NULL;
1125 sigset_t old_ss;
1126 int pam_code = PAM_SUCCESS, r;
1127 bool close_session = false;
1128 pid_t pam_pid = 0, parent_pid;
1129 int flags = 0;
1130
1131 assert(name);
1132 assert(user);
1133 assert(env);
1134
1135 /* We set up PAM in the parent process, then fork. The child
1136 * will then stay around until killed via PR_GET_PDEATHSIG or
1137 * systemd via the cgroup logic. It will then remove the PAM
1138 * session again. The parent process will exec() the actual
1139 * daemon. We do things this way to ensure that the main PID
1140 * of the daemon is the one we initially fork()ed. */
1141
1142 r = barrier_create(&barrier);
1143 if (r < 0)
1144 goto fail;
1145
1146 if (log_get_max_level() < LOG_DEBUG)
1147 flags |= PAM_SILENT;
1148
1149 pam_code = pam_start(name, user, &conv, &handle);
1150 if (pam_code != PAM_SUCCESS) {
1151 handle = NULL;
1152 goto fail;
1153 }
1154
1155 if (!tty) {
1156 _cleanup_free_ char *q = NULL;
1157
1158 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1159 * out if that's the case, and read the TTY off it. */
1160
1161 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1162 tty = strjoina("/dev/", q);
1163 }
1164
1165 if (tty) {
1166 pam_code = pam_set_item(handle, PAM_TTY, tty);
1167 if (pam_code != PAM_SUCCESS)
1168 goto fail;
1169 }
1170
1171 STRV_FOREACH(nv, *env) {
1172 pam_code = pam_putenv(handle, *nv);
1173 if (pam_code != PAM_SUCCESS)
1174 goto fail;
1175 }
1176
1177 pam_code = pam_acct_mgmt(handle, flags);
1178 if (pam_code != PAM_SUCCESS)
1179 goto fail;
1180
1181 pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
1182 if (pam_code != PAM_SUCCESS)
1183 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));
1184
1185 pam_code = pam_open_session(handle, flags);
1186 if (pam_code != PAM_SUCCESS)
1187 goto fail;
1188
1189 close_session = true;
1190
1191 e = pam_getenvlist(handle);
1192 if (!e) {
1193 pam_code = PAM_BUF_ERR;
1194 goto fail;
1195 }
1196
1197 /* Block SIGTERM, so that we know that it won't get lost in the child */
1198
1199 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1200
1201 parent_pid = getpid_cached();
1202
1203 r = safe_fork("(sd-pam)", 0, &pam_pid);
1204 if (r < 0)
1205 goto fail;
1206 if (r == 0) {
c8f7c9a1 1207 int ret = EXIT_PAM;
75689fb2
LB
1208
1209 /* The child's job is to reset the PAM session on termination */
1210 barrier_set_role(&barrier, BARRIER_CHILD);
1211
1212 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1213 * those fds are open here that have been opened by PAM. */
1214 (void) close_many(fds, n_fds);
1215
1216 /* Drop privileges - we don't need any to pam_close_session and this will make
1217 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
1218 * threads to fail to exit normally */
1219
6498a0c2 1220 r = fully_set_uid_gid(uid, gid, /* supplementary_gids= */ NULL, /* n_supplementary_gids= */ 0);
75689fb2 1221 if (r < 0)
6498a0c2 1222 log_warning_errno(r, "Failed to drop privileges in sd-pam: %m");
75689fb2
LB
1223
1224 (void) ignore_signals(SIGPIPE);
1225
1226 /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1227 * otherwise the kernel will not allow unprivileged parents kill their privileged children
1228 * this way. We rely on the control groups kill logic to do the rest for us. */
1229 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1230 goto child_finish;
1231
1232 /* Tell the parent that our setup is done. This is especially important regarding dropping
1233 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1234 *
1235 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1236 (void) barrier_place(&barrier);
1237
1238 /* Check if our parent process might already have died? */
1239 if (getppid() == parent_pid) {
1240 sigset_t ss;
c8f7c9a1 1241 int sig;
75689fb2
LB
1242
1243 assert_se(sigemptyset(&ss) >= 0);
1244 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1245
c8f7c9a1
MY
1246 assert_se(sigwait(&ss, &sig) == 0);
1247 assert(sig == SIGTERM);
75689fb2
LB
1248 }
1249
1250 pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
1251 if (pam_code != PAM_SUCCESS)
1252 goto child_finish;
1253
1254 /* If our parent died we'll end the session */
1255 if (getppid() != parent_pid) {
1256 pam_code = pam_close_session(handle, flags);
1257 if (pam_code != PAM_SUCCESS)
1258 goto child_finish;
1259 }
1260
1261 ret = 0;
1262
1263 child_finish:
1264 /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1265 * know about this. See pam_end(3) */
1266 (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
1267 _exit(ret);
1268 }
1269
1270 barrier_set_role(&barrier, BARRIER_PARENT);
1271
1272 /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1273 * here. */
1274 handle = NULL;
1275
1276 /* Unblock SIGTERM again in the parent */
1277 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1278
1279 /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1280 * this fd around. */
1281 closelog();
1282
1283 /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1284 * recover. However, warn loudly if it happens. */
1285 if (!barrier_place_and_sync(&barrier))
1286 log_error("PAM initialization failed");
1287
1288 return strv_free_and_replace(*env, e);
1289
1290fail:
1291 if (pam_code != PAM_SUCCESS) {
1292 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1293 r = -EPERM; /* PAM errors do not map to errno */
1294 } else
1295 log_error_errno(r, "PAM failed: %m");
1296
1297 if (handle) {
1298 if (close_session)
1299 pam_code = pam_close_session(handle, flags);
1300
1301 (void) pam_end(handle, pam_code | flags);
1302 }
1303
1304 closelog();
1305 return r;
1306#else
1307 return 0;
1308#endif
1309}
1310
1311static void rename_process_from_path(const char *path) {
1312 _cleanup_free_ char *buf = NULL;
1313 const char *p;
1314
1315 assert(path);
1316
1317 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1318 * /bin/ps */
1319
1320 if (path_extract_filename(path, &buf) < 0) {
1321 rename_process("(...)");
1322 return;
1323 }
1324
1325 size_t l = strlen(buf);
1326 if (l > 8) {
1327 /* The end of the process name is usually more interesting, since the first bit might just be
1328 * "systemd-" */
1329 p = buf + l - 8;
1330 l = 8;
1331 } else
1332 p = buf;
1333
1334 char process_name[11];
1335 process_name[0] = '(';
1336 memcpy(process_name+1, p, l);
1337 process_name[1+l] = ')';
1338 process_name[1+l+1] = 0;
1339
c0e82e3a 1340 (void) rename_process(process_name);
75689fb2
LB
1341}
1342
1343static bool context_has_address_families(const ExecContext *c) {
1344 assert(c);
1345
1346 return c->address_families_allow_list ||
1347 !set_isempty(c->address_families);
1348}
1349
1350static bool context_has_syscall_filters(const ExecContext *c) {
1351 assert(c);
1352
1353 return c->syscall_allow_list ||
1354 !hashmap_isempty(c->syscall_filter);
1355}
1356
1357static bool context_has_syscall_logs(const ExecContext *c) {
1358 assert(c);
1359
1360 return c->syscall_log_allow_list ||
1361 !hashmap_isempty(c->syscall_log);
1362}
1363
24832d10 1364static bool context_has_seccomp(const ExecContext *c) {
75689fb2
LB
1365 /* We need NNP if we have any form of seccomp and are unprivileged */
1366 return c->lock_personality ||
1367 c->memory_deny_write_execute ||
1368 c->private_devices ||
1369 c->protect_clock ||
1370 c->protect_hostname ||
1371 c->protect_kernel_tunables ||
1372 c->protect_kernel_modules ||
1373 c->protect_kernel_logs ||
1374 context_has_address_families(c) ||
1375 exec_context_restrict_namespaces_set(c) ||
1376 c->restrict_realtime ||
1377 c->restrict_suid_sgid ||
1378 !set_isempty(c->syscall_archs) ||
1379 context_has_syscall_filters(c) ||
1380 context_has_syscall_logs(c);
1381}
1382
24832d10
ILG
1383static bool context_has_no_new_privileges(const ExecContext *c) {
1384 assert(c);
1385
1386 if (c->no_new_privileges)
1387 return true;
1388
1389 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1390 return false;
1391
1392 return context_has_seccomp(c);
1393}
1394
75689fb2
LB
1395#if HAVE_SECCOMP
1396
24832d10
ILG
1397static bool seccomp_allows_drop_privileges(const ExecContext *c) {
1398 void *id, *val;
1399 bool has_capget = false, has_capset = false, has_prctl = false;
1400
1401 assert(c);
1402
1403 /* No syscall filter, we are allowed to drop privileges */
1404 if (hashmap_isempty(c->syscall_filter))
1405 return true;
1406
1407 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
1408 _cleanup_free_ char *name = NULL;
1409
1410 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
1411
1412 if (streq(name, "capget"))
1413 has_capget = true;
1414 else if (streq(name, "capset"))
1415 has_capset = true;
1416 else if (streq(name, "prctl"))
1417 has_prctl = true;
1418 }
1419
1420 if (c->syscall_allow_list)
1421 return has_capget && has_capset && has_prctl;
1422 else
1423 return !(has_capget || has_capset || has_prctl);
1424}
1425
75689fb2
LB
1426static bool skip_seccomp_unavailable(const ExecContext *c, const ExecParameters *p, const char* msg) {
1427
1428 if (is_seccomp_available())
1429 return false;
1430
1431 log_exec_debug(c, p, "SECCOMP features not detected in the kernel, skipping %s", msg);
1432 return true;
1433}
1434
1435static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p, bool needs_ambient_hack) {
1436 uint32_t negative_action, default_action, action;
1437 int r;
1438
1439 assert(c);
1440 assert(p);
1441
1442 if (!context_has_syscall_filters(c))
1443 return 0;
1444
1445 if (skip_seccomp_unavailable(c, p, "SystemCallFilter="))
1446 return 0;
1447
1448 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1449
1450 if (c->syscall_allow_list) {
1451 default_action = negative_action;
1452 action = SCMP_ACT_ALLOW;
1453 } else {
1454 default_action = SCMP_ACT_ALLOW;
1455 action = negative_action;
1456 }
1457
1458 if (needs_ambient_hack) {
1459 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1460 if (r < 0)
1461 return r;
1462 }
1463
1464 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1465}
1466
1467static int apply_syscall_log(const ExecContext *c, const ExecParameters *p) {
1468#ifdef SCMP_ACT_LOG
1469 uint32_t default_action, action;
1470#endif
1471
1472 assert(c);
1473 assert(p);
1474
1475 if (!context_has_syscall_logs(c))
1476 return 0;
1477
1478#ifdef SCMP_ACT_LOG
1479 if (skip_seccomp_unavailable(c, p, "SystemCallLog="))
1480 return 0;
1481
1482 if (c->syscall_log_allow_list) {
1483 /* Log nothing but the ones listed */
1484 default_action = SCMP_ACT_ALLOW;
1485 action = SCMP_ACT_LOG;
1486 } else {
1487 /* Log everything but the ones listed */
1488 default_action = SCMP_ACT_LOG;
1489 action = SCMP_ACT_ALLOW;
1490 }
1491
1492 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1493#else
1494 /* old libseccomp */
1495 log_exec_debug(c, p, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1496 return 0;
1497#endif
1498}
1499
1500static int apply_syscall_archs(const ExecContext *c, const ExecParameters *p) {
1501 assert(c);
1502 assert(p);
1503
1504 if (set_isempty(c->syscall_archs))
1505 return 0;
1506
1507 if (skip_seccomp_unavailable(c, p, "SystemCallArchitectures="))
1508 return 0;
1509
1510 return seccomp_restrict_archs(c->syscall_archs);
1511}
1512
1513static int apply_address_families(const ExecContext *c, const ExecParameters *p) {
1514 assert(c);
1515 assert(p);
1516
1517 if (!context_has_address_families(c))
1518 return 0;
1519
1520 if (skip_seccomp_unavailable(c, p, "RestrictAddressFamilies="))
1521 return 0;
1522
1523 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1524}
1525
1526static int apply_memory_deny_write_execute(const ExecContext *c, const ExecParameters *p) {
1527 int r;
1528
1529 assert(c);
1530 assert(p);
1531
1532 if (!c->memory_deny_write_execute)
1533 return 0;
1534
1535 /* use prctl() if kernel supports it (6.3) */
1536 r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1537 if (r == 0) {
1538 log_exec_debug(c, p, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1539 return 0;
1540 }
1541 if (r < 0 && errno != EINVAL)
1542 return log_exec_debug_errno(c,
1543 p,
1544 errno,
1545 "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1546 /* else use seccomp */
1547 log_exec_debug(c, p, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1548
1549 if (skip_seccomp_unavailable(c, p, "MemoryDenyWriteExecute="))
1550 return 0;
1551
1552 return seccomp_memory_deny_write_execute();
1553}
1554
1555static int apply_restrict_realtime(const ExecContext *c, const ExecParameters *p) {
1556 assert(c);
1557 assert(p);
1558
1559 if (!c->restrict_realtime)
1560 return 0;
1561
1562 if (skip_seccomp_unavailable(c, p, "RestrictRealtime="))
1563 return 0;
1564
1565 return seccomp_restrict_realtime();
1566}
1567
1568static int apply_restrict_suid_sgid(const ExecContext *c, const ExecParameters *p) {
1569 assert(c);
1570 assert(p);
1571
1572 if (!c->restrict_suid_sgid)
1573 return 0;
1574
1575 if (skip_seccomp_unavailable(c, p, "RestrictSUIDSGID="))
1576 return 0;
1577
1578 return seccomp_restrict_suid_sgid();
1579}
1580
1581static int apply_protect_sysctl(const ExecContext *c, const ExecParameters *p) {
1582 assert(c);
1583 assert(p);
1584
1585 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1586 * let's protect even those systems where this is left on in the kernel. */
1587
1588 if (!c->protect_kernel_tunables)
1589 return 0;
1590
1591 if (skip_seccomp_unavailable(c, p, "ProtectKernelTunables="))
1592 return 0;
1593
1594 return seccomp_protect_sysctl();
1595}
1596
1597static int apply_protect_kernel_modules(const ExecContext *c, const ExecParameters *p) {
1598 assert(c);
1599 assert(p);
1600
1601 /* Turn off module syscalls on ProtectKernelModules=yes */
1602
1603 if (!c->protect_kernel_modules)
1604 return 0;
1605
1606 if (skip_seccomp_unavailable(c, p, "ProtectKernelModules="))
1607 return 0;
1608
1609 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1610}
1611
1612static int apply_protect_kernel_logs(const ExecContext *c, const ExecParameters *p) {
1613 assert(c);
1614 assert(p);
1615
1616 if (!c->protect_kernel_logs)
1617 return 0;
1618
1619 if (skip_seccomp_unavailable(c, p, "ProtectKernelLogs="))
1620 return 0;
1621
1622 return seccomp_protect_syslog();
1623}
1624
1625static int apply_protect_clock(const ExecContext *c, const ExecParameters *p) {
1626 assert(c);
1627 assert(p);
1628
1629 if (!c->protect_clock)
1630 return 0;
1631
1632 if (skip_seccomp_unavailable(c, p, "ProtectClock="))
1633 return 0;
1634
1635 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1636}
1637
1638static int apply_private_devices(const ExecContext *c, const ExecParameters *p) {
1639 assert(c);
1640 assert(p);
1641
1642 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1643
1644 if (!c->private_devices)
1645 return 0;
1646
1647 if (skip_seccomp_unavailable(c, p, "PrivateDevices="))
1648 return 0;
1649
1650 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1651}
1652
1653static int apply_restrict_namespaces(const ExecContext *c, const ExecParameters *p) {
1654 assert(c);
1655 assert(p);
1656
1657 if (!exec_context_restrict_namespaces_set(c))
1658 return 0;
1659
1660 if (skip_seccomp_unavailable(c, p, "RestrictNamespaces="))
1661 return 0;
1662
1663 return seccomp_restrict_namespaces(c->restrict_namespaces);
1664}
1665
1666static int apply_lock_personality(const ExecContext *c, const ExecParameters *p) {
1667 unsigned long personality;
1668 int r;
1669
1670 assert(c);
1671 assert(p);
1672
1673 if (!c->lock_personality)
1674 return 0;
1675
1676 if (skip_seccomp_unavailable(c, p, "LockPersonality="))
1677 return 0;
1678
1679 personality = c->personality;
1680
1681 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1682 if (personality == PERSONALITY_INVALID) {
1683
1684 r = opinionated_personality(&personality);
1685 if (r < 0)
1686 return r;
1687 }
1688
1689 return seccomp_lock_personality(personality);
1690}
1691
1692#endif
1693
1694#if HAVE_LIBBPF
1695static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) {
1696 int r;
1697
1698 assert(c);
1699 assert(p);
1700
1701 if (!exec_context_restrict_filesystems_set(c))
1702 return 0;
1703
1704 if (p->bpf_outer_map_fd < 0) {
1705 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1706 log_exec_debug(c, p, "LSM BPF not supported, skipping RestrictFileSystems=");
1707 return 0;
1708 }
1709
1710 /* We are in a new binary, so dl-open again */
1711 r = dlopen_bpf();
1712 if (r < 0)
1713 return r;
1714
1715 return lsm_bpf_restrict_filesystems(c->restrict_filesystems, p->cgroup_id, p->bpf_outer_map_fd, c->restrict_filesystems_allow_list);
1716}
1717#endif
1718
1719static int apply_protect_hostname(const ExecContext *c, const ExecParameters *p, int *ret_exit_status) {
1720 assert(c);
1721 assert(p);
1722
1723 if (!c->protect_hostname)
1724 return 0;
1725
1726 if (ns_type_supported(NAMESPACE_UTS)) {
1727 if (unshare(CLONE_NEWUTS) < 0) {
1728 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1729 *ret_exit_status = EXIT_NAMESPACE;
1730 return log_exec_error_errno(c,
1731 p,
1732 errno,
1733 "Failed to set up UTS namespacing: %m");
1734 }
1735
1736 log_exec_warning(c,
1737 p,
1738 "ProtectHostname=yes is configured, but UTS namespace setup is "
1739 "prohibited (container manager?), ignoring namespace setup.");
1740 }
1741 } else
1742 log_exec_warning(c,
1743 p,
1744 "ProtectHostname=yes is configured, but the kernel does not "
1745 "support UTS namespaces, ignoring namespace setup.");
1746
1747#if HAVE_SECCOMP
1748 int r;
1749
1750 if (skip_seccomp_unavailable(c, p, "ProtectHostname="))
1751 return 0;
1752
1753 r = seccomp_protect_hostname();
1754 if (r < 0) {
1755 *ret_exit_status = EXIT_SECCOMP;
1756 return log_exec_error_errno(c, p, r, "Failed to apply hostname restrictions: %m");
1757 }
1758#endif
1759
1760 return 0;
1761}
1762
1763static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1764 assert(idle_pipe);
1765
1766 idle_pipe[1] = safe_close(idle_pipe[1]);
1767 idle_pipe[2] = safe_close(idle_pipe[2]);
1768
1769 if (idle_pipe[0] >= 0) {
1770 int r;
1771
1772 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1773
1774 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1775 ssize_t n;
1776
1777 /* Signal systemd that we are bored and want to continue. */
1778 n = write(idle_pipe[3], "x", 1);
1779 if (n > 0)
1780 /* Wait for systemd to react to the signal above. */
1781 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1782 }
1783
1784 idle_pipe[0] = safe_close(idle_pipe[0]);
1785
1786 }
1787
1788 idle_pipe[3] = safe_close(idle_pipe[3]);
1789}
1790
1791static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1792
1793/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
1794 * the service payload in. */
1795static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1796 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
1797 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
1798 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
1799 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
1800 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
1801};
1802
1803DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
1804
1805static int build_environment(
1806 const ExecContext *c,
1807 const ExecParameters *p,
1808 const CGroupContext *cgroup_context,
1809 size_t n_fds,
75689fb2
LB
1810 const char *home,
1811 const char *username,
1812 const char *shell,
1813 dev_t journal_stream_dev,
1814 ino_t journal_stream_ino,
1815 const char *memory_pressure_path,
1816 char ***ret) {
1817
1818 _cleanup_strv_free_ char **our_env = NULL;
1819 size_t n_env = 0;
1820 char *x;
1821 int r;
1822
1823 assert(c);
1824 assert(p);
1825 assert(ret);
1826
1827#define N_ENV_VARS 19
1828 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1829 if (!our_env)
1830 return -ENOMEM;
1831
1832 if (n_fds > 0) {
1833 _cleanup_free_ char *joined = NULL;
1834
1835 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1836 return -ENOMEM;
1837 our_env[n_env++] = x;
1838
1839 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1840 return -ENOMEM;
1841 our_env[n_env++] = x;
1842
1eeaa93d 1843 joined = strv_join(p->fd_names, ":");
75689fb2
LB
1844 if (!joined)
1845 return -ENOMEM;
1846
1847 x = strjoin("LISTEN_FDNAMES=", joined);
1848 if (!x)
1849 return -ENOMEM;
1850 our_env[n_env++] = x;
1851 }
1852
1853 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1854 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1855 return -ENOMEM;
1856 our_env[n_env++] = x;
1857
1858 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1859 return -ENOMEM;
1860 our_env[n_env++] = x;
1861 }
1862
1863 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1864 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1865 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1866 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1867 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1868 if (!x)
1869 return -ENOMEM;
1870 our_env[n_env++] = x;
1871 }
1872
1873 /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
1874 * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
1875 * really make much sense since we're not logged in. Hence we conditionalize the three based on
1876 * SetLoginEnvironment= switch. */
1877 if (!c->user && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
1878 r = get_fixed_user("root", &username, NULL, NULL, &home, &shell);
1879 if (r < 0)
1880 return log_exec_debug_errno(c,
1881 p,
1882 r,
1883 "Failed to determine user credentials for root: %m");
1884 }
1885
d1a5be82 1886 bool set_user_login_env = exec_context_get_set_login_environment(c);
75689fb2
LB
1887
1888 if (username) {
1889 x = strjoin("USER=", username);
1890 if (!x)
1891 return -ENOMEM;
1892 our_env[n_env++] = x;
1893
1894 if (set_user_login_env) {
1895 x = strjoin("LOGNAME=", username);
1896 if (!x)
1897 return -ENOMEM;
1898 our_env[n_env++] = x;
1899 }
1900 }
1901
1902 if (home && set_user_login_env) {
1903 x = strjoin("HOME=", home);
1904 if (!x)
1905 return -ENOMEM;
1906
1907 path_simplify(x + 5);
1908 our_env[n_env++] = x;
1909 }
1910
1911 if (shell && set_user_login_env) {
1912 x = strjoin("SHELL=", shell);
1913 if (!x)
1914 return -ENOMEM;
1915
1916 path_simplify(x + 6);
1917 our_env[n_env++] = x;
1918 }
1919
1920 if (!sd_id128_is_null(p->invocation_id)) {
1921 assert(p->invocation_id_string);
1922
1923 x = strjoin("INVOCATION_ID=", p->invocation_id_string);
1924 if (!x)
1925 return -ENOMEM;
1926
1927 our_env[n_env++] = x;
1928 }
1929
1930 if (exec_context_needs_term(c)) {
1931 _cleanup_free_ char *cmdline = NULL;
1932 const char *tty_path, *term = NULL;
1933
1934 tty_path = exec_context_tty_path(c);
1935
1936 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1937 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1938 * container manager passes to PID 1 ends up all the way in the console login shown. */
1939
1940 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1941 term = getenv("TERM");
1942 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1943 _cleanup_free_ char *key = NULL;
1944
1945 key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
1946 if (!key)
1947 return -ENOMEM;
1948
1949 r = proc_cmdline_get_key(key, 0, &cmdline);
1950 if (r < 0)
1951 log_exec_debug_errno(c,
1952 p,
1953 r,
1954 "Failed to read %s from kernel cmdline, ignoring: %m",
1955 key);
1956 else if (r > 0)
1957 term = cmdline;
1958 }
1959
1960 if (!term)
1961 term = default_term_for_tty(tty_path);
1962
1963 x = strjoin("TERM=", term);
1964 if (!x)
1965 return -ENOMEM;
1966 our_env[n_env++] = x;
1967 }
1968
1969 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1970 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1971 return -ENOMEM;
1972
1973 our_env[n_env++] = x;
1974 }
1975
1976 if (c->log_namespace) {
1977 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1978 if (!x)
1979 return -ENOMEM;
1980
1981 our_env[n_env++] = x;
1982 }
1983
1984 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1985 _cleanup_free_ char *joined = NULL;
1986 const char *n;
1987
1988 if (!p->prefix[t])
1989 continue;
1990
1991 if (c->directories[t].n_items == 0)
1992 continue;
1993
1994 n = exec_directory_env_name_to_string(t);
1995 if (!n)
1996 continue;
1997
1998 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1999 _cleanup_free_ char *prefixed = NULL;
2000
2001 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2002 if (!prefixed)
2003 return -ENOMEM;
2004
2005 if (!strextend_with_separator(&joined, ":", prefixed))
2006 return -ENOMEM;
2007 }
2008
2009 x = strjoin(n, "=", joined);
2010 if (!x)
2011 return -ENOMEM;
2012
2013 our_env[n_env++] = x;
2014 }
2015
2016 _cleanup_free_ char *creds_dir = NULL;
2017 r = exec_context_get_credential_directory(c, p, p->unit_id, &creds_dir);
2018 if (r < 0)
2019 return r;
2020 if (r > 0) {
2021 x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
2022 if (!x)
2023 return -ENOMEM;
2024
2025 our_env[n_env++] = x;
2026 }
2027
2028 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2029 return -ENOMEM;
2030
2031 our_env[n_env++] = x;
2032
2033 if (memory_pressure_path) {
2034 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2035 if (!x)
2036 return -ENOMEM;
2037
2038 our_env[n_env++] = x;
2039
2040 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2041 _cleanup_free_ char *b = NULL, *e = NULL;
2042
2043 if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2044 MEMORY_PRESSURE_DEFAULT_TYPE,
2045 cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2046 CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2047 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2048 return -ENOMEM;
2049
2050 if (base64mem(b, strlen(b) + 1, &e) < 0)
2051 return -ENOMEM;
2052
2053 x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2054 if (!x)
2055 return -ENOMEM;
2056
2057 our_env[n_env++] = x;
2058 }
2059 }
2060
2061 assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2062#undef N_ENV_VARS
2063
2064 *ret = TAKE_PTR(our_env);
2065
2066 return 0;
2067}
2068
2069static int build_pass_environment(const ExecContext *c, char ***ret) {
2070 _cleanup_strv_free_ char **pass_env = NULL;
2071 size_t n_env = 0;
2072
2073 STRV_FOREACH(i, c->pass_environment) {
2074 _cleanup_free_ char *x = NULL;
2075 char *v;
2076
2077 v = getenv(*i);
2078 if (!v)
2079 continue;
2080 x = strjoin(*i, "=", v);
2081 if (!x)
2082 return -ENOMEM;
2083
2084 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2085 return -ENOMEM;
2086
2087 pass_env[n_env++] = TAKE_PTR(x);
2088 pass_env[n_env] = NULL;
2089 }
2090
2091 *ret = TAKE_PTR(pass_env);
2092
2093 return 0;
2094}
2095
2096static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2097 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
71136404 2098 _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
75689fb2
LB
2099 _cleanup_close_ int unshare_ready_fd = -EBADF;
2100 _cleanup_(sigkill_waitp) pid_t pid = 0;
2101 uint64_t c = 1;
2102 ssize_t n;
2103 int r;
2104
2105 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2106 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2107 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2108 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2109 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2110 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2111 * continues execution normally.
2112 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2113 * does not need CAP_SETUID to write the single line mapping to itself. */
2114
2115 /* Can only set up multiple mappings with CAP_SETUID. */
2116 if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2117 r = asprintf(&uid_map,
2118 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2119 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2120 ouid, ouid, uid, uid);
2121 else
2122 r = asprintf(&uid_map,
2123 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2124 ouid, ouid);
2125
2126 if (r < 0)
2127 return -ENOMEM;
2128
2129 /* Can only set up multiple mappings with CAP_SETGID. */
2130 if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2131 r = asprintf(&gid_map,
2132 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2133 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2134 ogid, ogid, gid, gid);
2135 else
2136 r = asprintf(&gid_map,
2137 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2138 ogid, ogid);
2139
2140 if (r < 0)
2141 return -ENOMEM;
2142
2143 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2144 * namespace. */
2145 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2146 if (unshare_ready_fd < 0)
2147 return -errno;
2148
2149 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2150 * failed. */
2151 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2152 return -errno;
2153
e9ccae31 2154 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, &pid);
75689fb2
LB
2155 if (r < 0)
2156 return r;
2157 if (r == 0) {
2158 _cleanup_close_ int fd = -EBADF;
2159 const char *a;
2160 pid_t ppid;
2161
2162 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2163 * here, after the parent opened its own user namespace. */
2164
2165 ppid = getppid();
2166 errno_pipe[0] = safe_close(errno_pipe[0]);
2167
2168 /* Wait until the parent unshared the user namespace */
2169 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2170 r = -errno;
2171 goto child_fail;
2172 }
2173
2174 /* Disable the setgroups() system call in the child user namespace, for good. */
2175 a = procfs_file_alloca(ppid, "setgroups");
2176 fd = open(a, O_WRONLY|O_CLOEXEC);
2177 if (fd < 0) {
2178 if (errno != ENOENT) {
2179 r = -errno;
2180 goto child_fail;
2181 }
2182
2183 /* If the file is missing the kernel is too old, let's continue anyway. */
2184 } else {
2185 if (write(fd, "deny\n", 5) < 0) {
2186 r = -errno;
2187 goto child_fail;
2188 }
2189
2190 fd = safe_close(fd);
2191 }
2192
2193 /* First write the GID map */
2194 a = procfs_file_alloca(ppid, "gid_map");
2195 fd = open(a, O_WRONLY|O_CLOEXEC);
2196 if (fd < 0) {
2197 r = -errno;
2198 goto child_fail;
2199 }
2200 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2201 r = -errno;
2202 goto child_fail;
2203 }
2204 fd = safe_close(fd);
2205
2206 /* The write the UID map */
2207 a = procfs_file_alloca(ppid, "uid_map");
2208 fd = open(a, O_WRONLY|O_CLOEXEC);
2209 if (fd < 0) {
2210 r = -errno;
2211 goto child_fail;
2212 }
2213 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2214 r = -errno;
2215 goto child_fail;
2216 }
2217
2218 _exit(EXIT_SUCCESS);
2219
2220 child_fail:
2221 (void) write(errno_pipe[1], &r, sizeof(r));
2222 _exit(EXIT_FAILURE);
2223 }
2224
2225 errno_pipe[1] = safe_close(errno_pipe[1]);
2226
2227 if (unshare(CLONE_NEWUSER) < 0)
2228 return -errno;
2229
2230 /* Let the child know that the namespace is ready now */
2231 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2232 return -errno;
2233
2234 /* Try to read an error code from the child */
2235 n = read(errno_pipe[0], &r, sizeof(r));
2236 if (n < 0)
2237 return -errno;
2238 if (n == sizeof(r)) { /* an error code was sent to us */
2239 if (r < 0)
2240 return r;
2241 return -EIO;
2242 }
2243 if (n != 0) /* on success we should have read 0 bytes */
2244 return -EIO;
2245
2246 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2247 if (r < 0)
2248 return r;
2249 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2250 return -EIO;
2251
2252 return 0;
2253}
2254
2255static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2256 _cleanup_free_ char *src_abs = NULL;
2257 int r;
2258
2259 assert(source);
2260
2261 src_abs = path_join(root, source);
2262 if (!src_abs)
2263 return -ENOMEM;
2264
2265 STRV_FOREACH(dst, symlinks) {
2266 _cleanup_free_ char *dst_abs = NULL;
2267
2268 dst_abs = path_join(root, *dst);
2269 if (!dst_abs)
2270 return -ENOMEM;
2271
2272 r = mkdir_parents_label(dst_abs, 0755);
2273 if (r < 0)
2274 return r;
2275
2276 r = symlink_idempotent(src_abs, dst_abs, true);
2277 if (r < 0)
2278 return r;
2279 }
2280
2281 return 0;
2282}
2283
/* Creates and prepares all directories configured for the given ExecDirectoryType (runtime, state, cache,
 * logs, configuration) below params->prefix[type].
 *
 * For every configured item this creates the parent directories, handles the user-scope compatibility
 * symlinks into the configuration hierarchy, handles the "private/" indirection and the migrations between
 * private and public placement, applies the configured access mode and, in system scope only, changes
 * ownership of the tree to uid/gid. If no mount namespace is going to be used, the additional symlinks
 * configured for each item are created here as well.
 *
 * Returns 0 on success (or right away if no prefix is configured for this type). On failure a negative
 * errno-style value is returned and *exit_status is set to the exit status matching the directory type. */
static int setup_exec_directory(
                const ExecContext *context,
                const ExecParameters *params,
                uid_t uid,
                gid_t gid,
                ExecDirectoryType type,
                bool needs_mount_namespace,
                int *exit_status) {

        /* Maps each directory type to the exit status reported via *exit_status on failure */
        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
                [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
                [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
                [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
                [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
        };
        int r;

        assert(context);
        assert(params);
        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
        assert(exit_status);

        /* No prefix configured for this directory type: nothing to set up */
        if (!params->prefix[type])
                return 0;

        /* If chowning was requested, fall back to UID/GID 0 for IDs that are not valid */
        if (params->flags & EXEC_CHOWN_DIRECTORIES) {
                if (!uid_is_valid(uid))
                        uid = 0;
                if (!gid_is_valid(gid))
                        gid = 0;
        }

        for (size_t i = 0; i < context->directories[type].n_items; i++) {
                /* p is the public path of the directory; pp is only set when the private/ indirection
                 * is used, and then refers to the path below "private" */
                _cleanup_free_ char *p = NULL, *pp = NULL;

                p = path_join(params->prefix[type], context->directories[type].items[i].path);
                if (!p) {
                        r = -ENOMEM;
                        goto fail;
                }

                r = mkdir_parents_label(p, 0755);
                if (r < 0)
                        goto fail;

                if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {

                        /* If we are in user mode, and a configuration directory exists but a state directory
                         * doesn't exist, then we likely are upgrading from an older systemd version that
                         * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
                         * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
                         * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
                         * separated. If a service has both dirs configured but only the configuration dir
                         * exists and the state dir does not, we assume we are looking at an update
                         * situation. Hence, create a compatibility symlink, so that all expectations are
                         * met.
                         *
                         * (We also do something similar with the log directory, which still doesn't exist in
                         * the xdg basedir spec. We'll make it a subdir of the state dir.) */

                        /* this assumes the state dir is always created before the configuration dir */
                        assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
                        assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);

                        r = laccess(p, F_OK);
                        if (r == -ENOENT) {
                                _cleanup_free_ char *q = NULL;

                                /* OK, we know that the state dir does not exist. Let's see if the dir exists
                                 * under the configuration hierarchy. */

                                if (type == EXEC_DIRECTORY_STATE)
                                        q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
                                else if (type == EXEC_DIRECTORY_LOGS)
                                        q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
                                else
                                        assert_not_reached();
                                if (!q) {
                                        r = -ENOMEM;
                                        goto fail;
                                }

                                r = laccess(q, F_OK);
                                if (r >= 0) {
                                        /* It does exist! This hence looks like an update. Symlink the
                                         * configuration directory into the state directory. */

                                        r = symlink_idempotent(q, p, /* make_relative= */ true);
                                        if (r < 0)
                                                goto fail;

                                        log_exec_notice(context, params, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
                                        /* The symlink stands in for the directory, skip mode/ownership handling */
                                        continue;
                                } else if (r != -ENOENT)
                                        log_exec_warning_errno(context, params, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);

                        } else if (r < 0)
                                log_exec_warning_errno(context, params, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
                }

                if (exec_directory_is_private(context, type)) {
                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
                         * case we want to avoid leaving a directory around fully accessible that is owned by
                         * a dynamic user whose UID is later on reused. To lock this down we use the same
                         * trick used by container managers to prohibit host users to get access to files of
                         * the same UID in containers: we place everything inside a directory that has an
                         * access mode of 0700 and is owned root:root, so that it acts as security boundary
                         * for unprivileged host code. We then use fs namespacing to make this directory
                         * permeable for the service itself.
                         *
                         * Specifically: for a service which wants a special directory "foo/" we first create
                         * a directory "private/" with access mode 0700 owned by root:root. Then we place
                         * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
                         * "private/foo". This way, privileged host users can access "foo/" as usual, but
                         * unprivileged host users can't look into it. Inside of the namespace of the unit
                         * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
                         * "private/foo/" is mounted under the same name, thus disabling the access boundary
                         * for the service and making sure it only gets access to the dirs it needs but no
                         * others. Tricky? Yes, absolutely, but it works!
                         *
                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
                         * to be owned by the service itself.
                         *
                         * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
                         * for sharing files or sockets with other services. */

                        pp = path_join(params->prefix[type], "private");
                        if (!pp) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
                        r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
                        if (r < 0)
                                goto fail;

                        /* From here on pp refers to the item's path below the private root */
                        if (!path_extend(&pp, context->directories[type].items[i].path)) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* Create all directories between the configured directory and this private root, and mark them 0755 */
                        r = mkdir_parents_label(pp, 0755);
                        if (r < 0)
                                goto fail;

                        if (is_dir(p, false) > 0 &&
                            (laccess(pp, F_OK) == -ENOENT)) {

                                /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
                                 * it over. Most likely the service has been upgraded from one that didn't use
                                 * DynamicUser=1, to one that does. */

                                log_exec_info(context,
                                              params,
                                              "Found pre-existing public %s= directory %s, migrating to %s.\n"
                                              "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
                                              exec_directory_type_to_string(type), p, pp);

                                r = RET_NERRNO(rename(p, pp));
                                if (r < 0)
                                        goto fail;
                        } else {
                                /* Otherwise, create the actual directory for the service */

                                r = mkdir_label(pp, context->directories[type].mode);
                                if (r < 0 && r != -EEXIST)
                                        goto fail;
                        }

                        if (!context->directories[type].items[i].only_create) {
                                /* And link it up from the original place.
                                 * Notes
                                 * 1) If a mount namespace is going to be used, then this symlink remains on
                                 *    the host, and a new one for the child namespace will be created later.
                                 * 2) It is not necessary to create this symlink when one of its parent
                                 *    directories is specified and already created. E.g.
                                 *        StateDirectory=foo foo/bar
                                 *    In that case, the inode points to pp and p for "foo/bar" are the same:
                                 *        pp = "/var/lib/private/foo/bar"
                                 *        p = "/var/lib/foo/bar"
                                 *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
                                 *    we do not need to create the symlink, but we cannot create the symlink.
                                 *    See issue #24783. */
                                r = symlink_idempotent(pp, p, true);
                                if (r < 0)
                                        goto fail;
                        }

                } else {
                        _cleanup_free_ char *target = NULL;

                        if (type != EXEC_DIRECTORY_CONFIGURATION &&
                            readlink_and_make_absolute(p, &target) >= 0) {
                                _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;

                                /* This already exists and is a symlink? Interesting. Maybe it's one created
                                 * by DynamicUser=1 (see above)?
                                 *
                                 * We do this for all directory types except for ConfigurationDirectory=,
                                 * since they all support the private/ symlink logic at least in some
                                 * configurations, see above. */

                                r = chase(target, NULL, 0, &target_resolved, NULL);
                                if (r < 0)
                                        goto fail;

                                q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
                                if (!q) {
                                        r = -ENOMEM;
                                        goto fail;
                                }

                                /* /var/lib or friends may be symlinks. So, let's chase them also. */
                                r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
                                if (r < 0)
                                        goto fail;

                                if (path_equal(q_resolved, target_resolved)) {

                                        /* Hmm, apparently DynamicUser= was once turned on for this service,
                                         * but is no longer. Let's move the directory back up. */

                                        log_exec_info(context,
                                                      params,
                                                      "Found pre-existing private %s= directory %s, migrating to %s.\n"
                                                      "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
                                                      exec_directory_type_to_string(type), q, p);

                                        /* Remove the symlink first, then move the private directory into its place */
                                        r = RET_NERRNO(unlink(p));
                                        if (r < 0)
                                                goto fail;

                                        r = RET_NERRNO(rename(q, p));
                                        if (r < 0)
                                                goto fail;
                                }
                        }

                        r = mkdir_label(p, context->directories[type].mode);
                        if (r < 0) {
                                if (r != -EEXIST)
                                        goto fail;

                                if (type == EXEC_DIRECTORY_CONFIGURATION) {
                                        struct stat st;

                                        /* Don't change the owner/access mode of the configuration directory,
                                         * as in the common case it is not written to by a service, and shall
                                         * not be writable. */

                                        r = RET_NERRNO(stat(p, &st));
                                        if (r < 0)
                                                goto fail;

                                        /* Still complain if the access mode doesn't match */
                                        if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
                                                log_exec_warning(context,
                                                                 params,
                                                                 "%s \'%s\' already exists but the mode is different. "
                                                                 "(File system: %o %sMode: %o)",
                                                                 exec_directory_type_to_string(type), context->directories[type].items[i].path,
                                                                 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);

                                        continue;
                                }
                        }
                }

                /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
                 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
                 * current UID/GID ownership.) */
                /* pp is only non-NULL if the private branch above was taken, hence "pp ?: p" picks the
                 * private directory when there is one and the public path otherwise */
                r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
                if (r < 0)
                        goto fail;

                /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
                 * available to user code anyway */
                if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
                        continue;

                /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
                 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
                 * assignments to exist. */
                r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
                if (r < 0)
                        goto fail;
        }

        /* If we are not going to run in a namespace, set up the symlinks - otherwise
         * they are set up later, to allow configuring empty var/run/etc. */
        if (!needs_mount_namespace)
                for (size_t i = 0; i < context->directories[type].n_items; i++) {
                        r = create_many_symlinks(params->prefix[type],
                                                 context->directories[type].items[i].path,
                                                 context->directories[type].items[i].symlinks);
                        if (r < 0)
                                goto fail;
                }

        return 0;

fail:
        /* Common error exit: tell the caller which exit status corresponds to this directory type */
        *exit_status = exit_status_table[type];
        return r;
}
2592
#if ENABLE_SMACK
/* Picks the SMACK process label to use and applies it via mac_smack_apply_pid() with pid 0 (presumably
 * meaning the calling process - confirm against mac_smack_apply_pid()).
 *
 * Label selection:
 *   1. context->smack_process_label, if set;
 *   2. otherwise, if a fallback label is configured, the SMACK_ATTR_EXEC label read from executable_fd,
 *      or the fallback label if none could be read because the attribute is absent;
 *   3. otherwise nothing is applied.
 *
 * Returns 0 on success or if there is nothing to do, a negative errno-style value on failure. */
static int setup_smack(
                const ExecParameters *params,
                const ExecContext *context,
                int executable_fd) {

        _cleanup_free_ char *label_from_exec = NULL;
        const char *label;
        int r;

        assert(params);
        assert(executable_fd >= 0);

        if (context->smack_process_label)
                label = context->smack_process_label;
        else if (params->fallback_smack_process_label) {
                /* A missing xattr is fine here, we then simply use the fallback label */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &label_from_exec);
                if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                        return r;

                label = label_from_exec ?: params->fallback_smack_process_label;
        } else
                return 0;

        r = mac_smack_apply_pid(0, label);
        if (r < 0)
                return r;

        return 0;
}
#endif
2622
2623static int compile_bind_mounts(
2624 const ExecContext *context,
2625 const ExecParameters *params,
2626 BindMount **ret_bind_mounts,
2627 size_t *ret_n_bind_mounts,
2628 char ***ret_empty_directories) {
2629
2630 _cleanup_strv_free_ char **empty_directories = NULL;
2631 BindMount *bind_mounts = NULL;
2632 size_t n, h = 0;
2633 int r;
2634
2635 assert(context);
2636 assert(params);
2637 assert(ret_bind_mounts);
2638 assert(ret_n_bind_mounts);
2639 assert(ret_empty_directories);
2640
2641 CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2642
2643 n = context->n_bind_mounts;
2644 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2645 if (!params->prefix[t])
2646 continue;
2647
2648 for (size_t i = 0; i < context->directories[t].n_items; i++)
2649 n += !context->directories[t].items[i].only_create;
2650 }
2651
2652 if (n <= 0) {
2653 *ret_bind_mounts = NULL;
2654 *ret_n_bind_mounts = 0;
2655 *ret_empty_directories = NULL;
2656 return 0;
2657 }
2658
2659 bind_mounts = new(BindMount, n);
2660 if (!bind_mounts)
2661 return -ENOMEM;
2662
2663 for (size_t i = 0; i < context->n_bind_mounts; i++) {
2664 BindMount *item = context->bind_mounts + i;
2665 _cleanup_free_ char *s = NULL, *d = NULL;
2666
2667 s = strdup(item->source);
2668 if (!s)
2669 return -ENOMEM;
2670
2671 d = strdup(item->destination);
2672 if (!d)
2673 return -ENOMEM;
2674
2675 bind_mounts[h++] = (BindMount) {
2676 .source = TAKE_PTR(s),
2677 .destination = TAKE_PTR(d),
2678 .read_only = item->read_only,
2679 .recursive = item->recursive,
2680 .ignore_enoent = item->ignore_enoent,
2681 };
2682 }
2683
2684 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2685 if (!params->prefix[t])
2686 continue;
2687
2688 if (context->directories[t].n_items == 0)
2689 continue;
2690
2691 if (exec_directory_is_private(context, t) &&
2692 !exec_context_with_rootfs(context)) {
2693 char *private_root;
2694
2695 /* So this is for a dynamic user, and we need to make sure the process can access its own
2696 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2697 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2698
2699 private_root = path_join(params->prefix[t], "private");
2700 if (!private_root)
2701 return -ENOMEM;
2702
2703 r = strv_consume(&empty_directories, private_root);
2704 if (r < 0)
2705 return r;
2706 }
2707
2708 for (size_t i = 0; i < context->directories[t].n_items; i++) {
2709 _cleanup_free_ char *s = NULL, *d = NULL;
2710
2711 /* When one of the parent directories is in the list, we cannot create the symlink
2712 * for the child directory. See also the comments in setup_exec_directory(). */
2713 if (context->directories[t].items[i].only_create)
2714 continue;
2715
2716 if (exec_directory_is_private(context, t))
2717 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
2718 else
2719 s = path_join(params->prefix[t], context->directories[t].items[i].path);
2720 if (!s)
2721 return -ENOMEM;
2722
2723 if (exec_directory_is_private(context, t) &&
2724 exec_context_with_rootfs(context))
2725 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2726 * directory is not created on the root directory. So, let's bind-mount the directory
2727 * on the 'non-private' place. */
2728 d = path_join(params->prefix[t], context->directories[t].items[i].path);
2729 else
2730 d = strdup(s);
2731 if (!d)
2732 return -ENOMEM;
2733
2734 bind_mounts[h++] = (BindMount) {
2735 .source = TAKE_PTR(s),
2736 .destination = TAKE_PTR(d),
2737 .read_only = false,
2738 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2739 .recursive = true,
2740 .ignore_enoent = false,
2741 };
2742 }
2743 }
2744
2745 assert(h == n);
2746
2747 *ret_bind_mounts = TAKE_PTR(bind_mounts);
2748 *ret_n_bind_mounts = n;
2749 *ret_empty_directories = TAKE_PTR(empty_directories);
2750
2751 return (int) n;
2752}
2753
2754/* ret_symlinks will contain a list of pairs src:dest that describes
2755 * the symlinks to create later on. For example, the symlinks needed
2756 * to safely give private directories to DynamicUser=1 users. */
2757static int compile_symlinks(
2758 const ExecContext *context,
2759 const ExecParameters *params,
2760 bool setup_os_release_symlink,
2761 char ***ret_symlinks) {
2762
2763 _cleanup_strv_free_ char **symlinks = NULL;
2764 int r;
2765
2766 assert(context);
2767 assert(params);
2768 assert(ret_symlinks);
2769
2770 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2771 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
2772 _cleanup_free_ char *private_path = NULL, *path = NULL;
2773
2774 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
2775 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
2776
2777 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2778 dst_abs = path_join(params->prefix[dt], *symlink);
2779 if (!src_abs || !dst_abs)
2780 return -ENOMEM;
2781
2782 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2783 if (r < 0)
2784 return r;
2785 }
2786
2787 if (!exec_directory_is_private(context, dt) ||
2788 exec_context_with_rootfs(context) ||
2789 context->directories[dt].items[i].only_create)
2790 continue;
2791
2792 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
2793 if (!private_path)
2794 return -ENOMEM;
2795
2796 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2797 if (!path)
2798 return -ENOMEM;
2799
2800 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2801 if (r < 0)
2802 return r;
2803 }
2804 }
2805
2806 /* We make the host's os-release available via a symlink, so that we can copy it atomically
2807 * and readers will never get a half-written version. Note that, while the paths specified here are
2808 * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2809 * 'os-release -> .os-release-stage/os-release' is what will be created. */
2810 if (setup_os_release_symlink) {
2811 r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release");
2812 if (r < 0)
2813 return r;
2814
2815 r = strv_extend(&symlinks, "/run/host/os-release");
2816 if (r < 0)
2817 return r;
2818 }
2819
2820 *ret_symlinks = TAKE_PTR(symlinks);
2821
2822 return 0;
2823}
2824
2825static bool insist_on_sandboxing(
2826 const ExecContext *context,
2827 const char *root_dir,
2828 const char *root_image,
2829 const BindMount *bind_mounts,
2830 size_t n_bind_mounts) {
2831
2832 assert(context);
2833 assert(n_bind_mounts == 0 || bind_mounts);
2834
2835 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2836 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2837 * rearrange stuff in a way we cannot ignore gracefully. */
2838
2839 if (context->n_temporary_filesystems > 0)
2840 return true;
2841
2842 if (root_dir || root_image)
2843 return true;
2844
2845 if (context->n_mount_images > 0)
2846 return true;
2847
2848 if (context->dynamic_user)
2849 return true;
2850
2851 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
2852 return true;
2853
2854 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2855 * essential. */
2856 for (size_t i = 0; i < n_bind_mounts; i++)
2857 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2858 return true;
2859
2860 if (context->log_namespace)
2861 return true;
2862
2863 return false;
2864}
2865
2866static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
2867 _cleanup_close_ int fd = -EBADF;
2868 int r;
2869
2870 if (!runtime || !runtime->ephemeral_copy)
2871 return 0;
2872
2873 r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
2874 if (r < 0)
2875 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
2876
2877 CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
2878
2879 fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
2880 if (fd >= 0)
2881 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
2882 return 0;
2883
2884 if (fd != -EAGAIN)
2885 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
2886
2887 log_debug("Making ephemeral snapshot of %s to %s",
2888 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
2889
2890 if (context->root_image)
2891 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
2892 COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
2893 else
2894 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
2895 AT_FDCWD, runtime->ephemeral_copy,
2896 BTRFS_SNAPSHOT_FALLBACK_COPY |
2897 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
2898 BTRFS_SNAPSHOT_RECURSIVE |
2899 BTRFS_SNAPSHOT_LOCK_BSD);
2900 if (fd < 0)
2901 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
2902 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
2903
2904 if (context->root_image) {
2905 /* A root image might be subject to lots of random writes so let's try to disable COW on it
2906 * which tends to not perform well in combination with lots of random writes.
2907 *
2908 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
2909 * copy, but we at least want to make the intention clear.
2910 */
2911 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
2912 if (r < 0)
2913 log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
2914 }
2915
2916 r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
2917 if (r < 0)
2918 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
2919
2920 return 1;
2921}
2922
2923static int verity_settings_prepare(
2924 VeritySettings *verity,
2925 const char *root_image,
2926 const void *root_hash,
2927 size_t root_hash_size,
2928 const char *root_hash_path,
2929 const void *root_hash_sig,
2930 size_t root_hash_sig_size,
2931 const char *root_hash_sig_path,
2932 const char *verity_data_path) {
2933
2934 int r;
2935
2936 assert(verity);
2937
2938 if (root_hash) {
2939 void *d;
2940
2941 d = memdup(root_hash, root_hash_size);
2942 if (!d)
2943 return -ENOMEM;
2944
2945 free_and_replace(verity->root_hash, d);
2946 verity->root_hash_size = root_hash_size;
2947 verity->designator = PARTITION_ROOT;
2948 }
2949
2950 if (root_hash_sig) {
2951 void *d;
2952
2953 d = memdup(root_hash_sig, root_hash_sig_size);
2954 if (!d)
2955 return -ENOMEM;
2956
2957 free_and_replace(verity->root_hash_sig, d);
2958 verity->root_hash_sig_size = root_hash_sig_size;
2959 verity->designator = PARTITION_ROOT;
2960 }
2961
2962 if (verity_data_path) {
2963 r = free_and_strdup(&verity->data_path, verity_data_path);
2964 if (r < 0)
2965 return r;
2966 }
2967
2968 r = verity_settings_load(
2969 verity,
2970 root_image,
2971 root_hash_path,
2972 root_hash_sig_path);
2973 if (r < 0)
2974 return log_debug_errno(r, "Failed to load root hash: %m");
2975
2976 return 0;
2977}
2978
/* Collects everything that influences the mount namespace of a command (root directory/image, bind
 * mounts, exec directory symlinks, private /tmp, credentials directory, propagation/incoming/extension
 * directories, verity settings and the various Protect*=/Private*= switches) into a NamespaceParameters
 * structure and passes it to setup_namespace().
 *
 * Most sandboxing related settings are only passed on if sandboxing is to be applied, i.e. if
 * EXEC_APPLY_SANDBOXING is set in params->flags and the command is not marked
 * EXEC_COMMAND_FULLY_PRIVILEGED.
 *
 * error_path is handed through to setup_namespace() unchanged.
 *
 * Returns the result of setup_namespace(), with one exception: if it fails with -ENOANO the failure is
 * ignored (0 is returned) unless insist_on_sandboxing() says the configured options alter the mount
 * environment non-trivially, in which case -EOPNOTSUPP is returned. */
static int apply_mount_namespace(
                ExecCommandFlags command_flags,
                const ExecContext *context,
                const ExecParameters *params,
                ExecRuntime *runtime,
                const char *memory_pressure_path,
                char **error_path) {

        _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
        _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
                **read_write_paths_cleanup = NULL;
        _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
                *extension_dir = NULL, *host_os_release_stage = NULL;
        const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
        char **read_write_paths;
        bool needs_sandboxing, setup_os_release_symlink;
        BindMount *bind_mounts = NULL;
        size_t n_bind_mounts = 0;
        int r;

        assert(context);

        CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);

        /* Determine the root directory/image to use, preferring the ephemeral copy if there is one */
        if (params->flags & EXEC_APPLY_CHROOT) {
                r = setup_ephemeral(context, runtime);
                if (r < 0)
                        return r;

                if (context->root_image)
                        root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
                else
                        root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
        }

        r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
        if (r < 0)
                return r;

        /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
         * service will need to write to it in order to start the notifications. */
        if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
                /* Work on a private copy, the list in the context must not be modified */
                read_write_paths_cleanup = strv_copy(context->read_write_paths);
                if (!read_write_paths_cleanup)
                        return -ENOMEM;

                r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
                if (r < 0)
                        return r;

                read_write_paths = read_write_paths_cleanup;
        } else
                read_write_paths = context->read_write_paths;

        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
        if (needs_sandboxing) {
                /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
                 * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
                 * use here. This does not apply when we are using /run/systemd/empty as fallback. */

                if (context->private_tmp && runtime && runtime->shared) {
                        if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
                                tmp_dir = runtime->shared->tmp_dir;
                        else if (runtime->shared->tmp_dir)
                                tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");

                        if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
                                var_tmp_dir = runtime->shared->var_tmp_dir;
                        else if (runtime->shared->var_tmp_dir)
                                var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
                }
        }

        /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
        setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
        r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
        if (r < 0)
                return r;

        if (context->mount_propagation_flag == MS_SHARED)
                log_exec_debug(context,
                               params,
                               "shared mount propagation hidden by other fs namespacing unit settings: ignoring");

        if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
                r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path);
                if (r < 0)
                        return r;
        }

        /* Pick the helper directories depending on whether we run in system or user scope */
        if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
                propagate_dir = path_join("/run/systemd/propagate/", params->unit_id);
                if (!propagate_dir)
                        return -ENOMEM;

                incoming_dir = strdup("/run/systemd/incoming");
                if (!incoming_dir)
                        return -ENOMEM;

                extension_dir = strdup("/run/systemd/unit-extensions");
                if (!extension_dir)
                        return -ENOMEM;

                /* If running under a different root filesystem, propagate the host's os-release. We make a
                 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
                if (setup_os_release_symlink) {
                        host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
                        if (!host_os_release_stage)
                                return -ENOMEM;
                }
        } else {
                assert(params->runtime_scope == RUNTIME_SCOPE_USER);

                /* Note that propagate_dir and incoming_dir remain NULL in user scope */
                if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
                        return -ENOMEM;

                if (setup_os_release_symlink) {
                        if (asprintf(&host_os_release_stage,
                                     "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
                                     geteuid()) < 0)
                                return -ENOMEM;
                }
        }

        if (root_image) {
                r = verity_settings_prepare(
                        &verity,
                        root_image,
                        context->root_hash, context->root_hash_size, context->root_hash_path,
                        context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
                        context->root_verity);
                if (r < 0)
                        return r;
        }

        /* Everything collected above, plus the settings taken directly from the context */
        NamespaceParameters parameters = {
                .runtime_scope = params->runtime_scope,

                .root_directory = root_dir,
                .root_image = root_image,
                .root_image_options = context->root_image_options,
                .root_image_policy = context->root_image_policy ?: &image_policy_service,

                .read_write_paths = read_write_paths,
                .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
                .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,

                .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
                .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,

                .empty_directories = empty_directories,
                .symlinks = symlinks,

                .bind_mounts = bind_mounts,
                .n_bind_mounts = n_bind_mounts,

                .temporary_filesystems = context->temporary_filesystems,
                .n_temporary_filesystems = context->n_temporary_filesystems,

                .mount_images = context->mount_images,
                .n_mount_images = context->n_mount_images,
                .mount_image_policy = context->mount_image_policy ?: &image_policy_service,

                .tmp_dir = tmp_dir,
                .var_tmp_dir = var_tmp_dir,

                .creds_path = creds_path,
                .log_namespace = context->log_namespace,
                .mount_propagation_flag = context->mount_propagation_flag,

                .verity = &verity,

                .extension_images = context->extension_images,
                .n_extension_images = context->n_extension_images,
                .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
                .extension_directories = context->extension_directories,

                .propagate_dir = propagate_dir,
                .incoming_dir = incoming_dir,
                .extension_dir = extension_dir,
                .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
                .host_os_release_stage = host_os_release_stage,

                /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
                 * otherwise enforce it, don't ignore protected paths and fail if we are unable to apply the
                 * sandbox inside the mount namespace. */
                .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,

                .protect_control_groups = needs_sandboxing && context->protect_control_groups,
                .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
                .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
                .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
                .protect_hostname = needs_sandboxing && context->protect_hostname,

                .private_dev = needs_sandboxing && context->private_devices,
                .private_network = needs_sandboxing && exec_needs_network_namespace(context),
                .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),

                .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),

                /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
                .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),

                .protect_home = needs_sandboxing ? context->protect_home : false,
                .protect_system = needs_sandboxing ? context->protect_system : false,
                .protect_proc = needs_sandboxing ? context->protect_proc : false,
                .proc_subset = needs_sandboxing ? context->proc_subset : false,
        };

        r = setup_namespace(&parameters, error_path);
        /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
         * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
         * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
         * completely different execution environment. */
        if (r == -ENOANO) {
                if (insist_on_sandboxing(
                                    context,
                                    root_dir, root_image,
                                    bind_mounts,
                                    n_bind_mounts))
                        return log_exec_debug_errno(context,
                                                    params,
                                                    SYNTHETIC_ERRNO(EOPNOTSUPP),
                                                    "Failed to set up namespace, and refusing to continue since "
                                                    "the selected namespacing options alter mount environment non-trivially.\n"
                                                    "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
                                                    n_bind_mounts,
                                                    context->n_temporary_filesystems,
                                                    yes_no(root_dir),
                                                    yes_no(root_image),
                                                    yes_no(context->dynamic_user));

                log_exec_debug(context, params, "Failed to set up namespace, assuming containerized execution and ignoring.");
                return 0;
        }

        return r;
}
3217
3218static int apply_working_directory(
3219 const ExecContext *context,
3220 const ExecParameters *params,
3221 ExecRuntime *runtime,
3222 const char *home,
3223 int *exit_status) {
3224
3225 const char *d, *wd;
3226
3227 assert(context);
3228 assert(exit_status);
3229
3230 if (context->working_directory_home) {
3231
3232 if (!home) {
3233 *exit_status = EXIT_CHDIR;
3234 return -ENXIO;
3235 }
3236
3237 wd = home;
3238
3239 } else
3240 wd = empty_to_root(context->working_directory);
3241
3242 if (params->flags & EXEC_APPLY_CHROOT)
3243 d = wd;
3244 else
3245 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
3246
3247 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3248 *exit_status = EXIT_CHDIR;
3249 return -errno;
3250 }
3251
3252 return 0;
3253}
3254
3255static int apply_root_directory(
3256 const ExecContext *context,
3257 const ExecParameters *params,
3258 ExecRuntime *runtime,
3259 const bool needs_mount_ns,
3260 int *exit_status) {
3261
3262 assert(context);
3263 assert(exit_status);
3264
3265 if (params->flags & EXEC_APPLY_CHROOT)
3266 if (!needs_mount_ns && context->root_directory)
3267 if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
3268 *exit_status = EXIT_CHROOT;
3269 return -errno;
3270 }
3271
3272 return 0;
3273}
3274
3275static int setup_keyring(
3276 const ExecContext *context,
3277 const ExecParameters *p,
3278 uid_t uid, gid_t gid) {
3279
3280 key_serial_t keyring;
3281 int r = 0;
3282 uid_t saved_uid;
3283 gid_t saved_gid;
3284
3285 assert(context);
3286 assert(p);
3287
3288 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3289 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3290 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3291 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3292 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3293 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3294
3295 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3296 return 0;
3297
3298 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3299 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3300 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3301 * & group is just as nasty as acquiring a reference to the user keyring. */
3302
3303 saved_uid = getuid();
3304 saved_gid = getgid();
3305
3306 if (gid_is_valid(gid) && gid != saved_gid) {
3307 if (setregid(gid, -1) < 0)
3308 return log_exec_error_errno(context,
3309 p,
3310 errno,
3311 "Failed to change GID for user keyring: %m");
3312 }
3313
3314 if (uid_is_valid(uid) && uid != saved_uid) {
3315 if (setreuid(uid, -1) < 0) {
3316 r = log_exec_error_errno(context,
3317 p,
3318 errno,
3319 "Failed to change UID for user keyring: %m");
3320 goto out;
3321 }
3322 }
3323
3324 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3325 if (keyring == -1) {
3326 if (errno == ENOSYS)
3327 log_exec_debug_errno(context,
3328 p,
3329 errno,
3330 "Kernel keyring not supported, ignoring.");
3331 else if (ERRNO_IS_PRIVILEGE(errno))
3332 log_exec_debug_errno(context,
3333 p,
3334 errno,
3335 "Kernel keyring access prohibited, ignoring.");
3336 else if (errno == EDQUOT)
3337 log_exec_debug_errno(context,
3338 p,
3339 errno,
3340 "Out of kernel keyrings to allocate, ignoring.");
3341 else
3342 r = log_exec_error_errno(context,
3343 p,
3344 errno,
3345 "Setting up kernel keyring failed: %m");
3346
3347 goto out;
3348 }
3349
3350 /* When requested link the user keyring into the session keyring. */
3351 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3352
3353 if (keyctl(KEYCTL_LINK,
3354 KEY_SPEC_USER_KEYRING,
3355 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3356 r = log_exec_error_errno(context,
3357 p,
3358 errno,
3359 "Failed to link user keyring into session keyring: %m");
3360 goto out;
3361 }
3362 }
3363
3364 /* Restore uid/gid back */
3365 if (uid_is_valid(uid) && uid != saved_uid) {
3366 if (setreuid(saved_uid, -1) < 0) {
3367 r = log_exec_error_errno(context,
3368 p,
3369 errno,
3370 "Failed to change UID back for user keyring: %m");
3371 goto out;
3372 }
3373 }
3374
3375 if (gid_is_valid(gid) && gid != saved_gid) {
3376 if (setregid(saved_gid, -1) < 0)
3377 return log_exec_error_errno(context,
3378 p,
3379 errno,
3380 "Failed to change GID back for user keyring: %m");
3381 }
3382
3383 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3384 if (!sd_id128_is_null(p->invocation_id)) {
3385 key_serial_t key;
3386
3387 key = add_key("user",
3388 "invocation_id",
3389 &p->invocation_id,
3390 sizeof(p->invocation_id),
3391 KEY_SPEC_SESSION_KEYRING);
3392 if (key == -1)
3393 log_exec_debug_errno(context,
3394 p,
3395 errno,
3396 "Failed to add invocation ID to keyring, ignoring: %m");
3397 else {
3398 if (keyctl(KEYCTL_SETPERM, key,
3399 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3400 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3401 r = log_exec_error_errno(context,
3402 p,
3403 errno,
3404 "Failed to restrict invocation ID permission: %m");
3405 }
3406 }
3407
3408out:
3409 /* Revert back uid & gid for the last time, and exit */
3410 /* no extra logging, as only the first already reported error matters */
3411 if (getuid() != saved_uid)
3412 (void) setreuid(saved_uid, -1);
3413
3414 if (getgid() != saved_gid)
3415 (void) setregid(saved_gid, -1);
3416
3417 return r;
3418}
3419
/* Appends each non-negative fd of 'pair' to 'array', advancing the element counter *n accordingly.
 * The caller must make sure 'array' has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++)
                if (pair[i] >= 0)
                        array[(*n)++] = pair[i];
}
3430
3431static int close_remaining_fds(
3432 const ExecParameters *params,
3433 const ExecRuntime *runtime,
3434 int socket_fd,
3435 const int *fds, size_t n_fds) {
3436
3437 size_t n_dont_close = 0;
3438 int dont_close[n_fds + 14];
3439
3440 assert(params);
3441
3442 if (params->stdin_fd >= 0)
3443 dont_close[n_dont_close++] = params->stdin_fd;
3444 if (params->stdout_fd >= 0)
3445 dont_close[n_dont_close++] = params->stdout_fd;
3446 if (params->stderr_fd >= 0)
3447 dont_close[n_dont_close++] = params->stderr_fd;
3448
3449 if (socket_fd >= 0)
3450 dont_close[n_dont_close++] = socket_fd;
3451 if (n_fds > 0) {
3452 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3453 n_dont_close += n_fds;
3454 }
3455
3456 if (runtime)
3457 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3458
3459 if (runtime && runtime->shared) {
3460 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3461 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
3462 }
3463
3464 if (runtime && runtime->dynamic_creds) {
3465 if (runtime->dynamic_creds->user)
3466 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3467 if (runtime->dynamic_creds->group)
3468 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
3469 }
3470
3471 if (params->user_lookup_fd >= 0)
3472 dont_close[n_dont_close++] = params->user_lookup_fd;
3473
3474 return close_all_fds(dont_close, n_dont_close);
3475}
3476
/* Sends the resolved UID/GID to PID 1 after we learnt it, as a single write consisting of the UID,
 * the GID and the unit name, in that order.
 *
 * Nothing is sent (and 0 is returned) if 'user_lookup_fd' is negative or if neither 'uid' nor 'gid'
 * is valid. Returns 0 on success and a negative errno if writev() fails. */
static int send_user_lookup(
                const char *unit_id,
                int user_lookup_fd,
                uid_t uid,
                gid_t gid) {

        assert(unit_id);

        if (user_lookup_fd < 0)
                return 0;

        /* Suppress the message if there was no user/group to resolve. */
        if (!(uid_is_valid(uid) || gid_is_valid(gid)))
                return 0;

        struct iovec iov[] = {
                IOVEC_MAKE(&uid, sizeof(uid)),
                IOVEC_MAKE(&gid, sizeof(gid)),
                IOVEC_MAKE_STRING(unit_id),
        };

        if (writev(user_lookup_fd, iov, 3) < 0)
                return -errno;

        return 0;
}
3504
3505static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3506 int r;
3507
3508 assert(c);
3509 assert(home);
3510 assert(buf);
3511
3512 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3513
3514 if (*home)
3515 return 0;
3516
3517 if (!c->working_directory_home)
3518 return 0;
3519
3520 r = get_home_dir(buf);
3521 if (r < 0)
3522 return r;
3523
3524 *home = *buf;
3525 return 1;
3526}
3527
3528static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3529 _cleanup_strv_free_ char ** list = NULL;
3530 int r;
3531
3532 assert(c);
3533 assert(p);
3534 assert(ret);
3535
3536 assert(c->dynamic_user);
3537
3538 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3539 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3540 * directories. */
3541
3542 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3543 if (t == EXEC_DIRECTORY_CONFIGURATION)
3544 continue;
3545
3546 if (!p->prefix[t])
3547 continue;
3548
3549 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3550 char *e;
3551
3552 if (exec_directory_is_private(c, t))
3553 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3554 else
3555 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3556 if (!e)
3557 return -ENOMEM;
3558
3559 r = strv_consume(&list, e);
3560 if (r < 0)
3561 return r;
3562 }
3563 }
3564
3565 *ret = TAKE_PTR(list);
3566
3567 return 0;
3568}
3569
3570static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3571 _cleanup_(cpu_set_reset) CPUSet s = {};
3572 int r;
3573
3574 assert(c);
3575 assert(ret);
3576
3577 if (!c->numa_policy.nodes.set) {
3578 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3579 return 0;
3580 }
3581
3582 r = numa_to_cpu_set(&c->numa_policy, &s);
3583 if (r < 0)
3584 return r;
3585
3586 cpu_set_reset(ret);
3587
3588 return cpu_set_add_all(ret, &s);
3589}
3590
5a5fdfe3 3591static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int *fd) {
75689fb2
LB
3592 int r;
3593
3594 assert(fds);
3595 assert(n_fds);
3596 assert(*n_fds < fds_size);
5a5fdfe3 3597 assert(fd);
75689fb2 3598
5a5fdfe3
MY
3599 if (*fd < 0)
3600 return 0;
75689fb2 3601
5a5fdfe3 3602 if (*fd < 3 + (int) *n_fds) {
75689fb2
LB
3603 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3604 * the fds we pass to the process (or which are closed only during execve). */
3605
5a5fdfe3 3606 r = fcntl(*fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
75689fb2
LB
3607 if (r < 0)
3608 return -errno;
3609
5a5fdfe3 3610 close_and_replace(*fd, r);
75689fb2
LB
3611 }
3612
5a5fdfe3 3613 fds[(*n_fds)++] = *fd;
75689fb2
LB
3614 return 1;
3615}
3616
3617static int connect_unix_harder(const ExecContext *c, const ExecParameters *p, const OpenFile *of, int ofd) {
3618 union sockaddr_union addr = {
3619 .un.sun_family = AF_UNIX,
3620 };
3621 socklen_t sa_len;
3622 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3623 int r;
3624
3625 assert(c);
3626 assert(p);
3627 assert(of);
3628 assert(ofd >= 0);
3629
3630 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3631 if (r < 0)
3632 return log_exec_error_errno(c, p, r, "Failed to set sockaddr for %s: %m", of->path);
3633
3634 sa_len = r;
3635
3636 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
3637 _cleanup_close_ int fd = -EBADF;
3638
3639 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
3640 if (fd < 0)
3641 return log_exec_error_errno(c,
3642 p,
3643 errno,
3644 "Failed to create socket for %s: %m",
3645 of->path);
3646
3647 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3648 if (r == -EPROTOTYPE)
3649 continue;
3650 if (r < 0)
3651 return log_exec_error_errno(c,
3652 p,
3653 r,
3654 "Failed to connect socket for %s: %m",
3655 of->path);
3656
3657 return TAKE_FD(fd);
3658 }
3659
3660 return log_exec_error_errno(c,
3661 p,
3662 SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".",
3663 of->path);
3664}
3665
3666static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const OpenFile *of) {
3667 struct stat st;
3668 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3669
3670 assert(c);
3671 assert(p);
3672 assert(of);
3673
3674 ofd = open(of->path, O_PATH | O_CLOEXEC);
3675 if (ofd < 0)
3676 return log_exec_error_errno(c, p, errno, "Could not open \"%s\": %m", of->path);
3677
3678 if (fstat(ofd, &st) < 0)
3679 return log_exec_error_errno(c, p, errno, "Failed to stat %s: %m", of->path);
3680
3681 if (S_ISSOCK(st.st_mode)) {
3682 fd = connect_unix_harder(c, p, of, ofd);
3683 if (fd < 0)
3684 return fd;
3685
3686 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
3687 return log_exec_error_errno(c, p, errno, "Failed to shutdown send for socket %s: %m",
3688 of->path);
3689
3690 log_exec_debug(c, p, "socket %s opened (fd=%d)", of->path, fd);
3691 } else {
3692 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3693 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3694 flags |= O_APPEND;
3695 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3696 flags |= O_TRUNC;
3697
3698 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3699 if (fd < 0)
3700 return log_exec_error_errno(c, p, fd, "Failed to open file %s: %m", of->path);
3701
3702 log_exec_debug(c, p, "file %s opened (fd=%d)", of->path, fd);
3703 }
3704
3705 return TAKE_FD(fd);
3706}
3707
1eeaa93d 3708static int collect_open_file_fds(const ExecContext *c, ExecParameters *p, size_t *n_fds) {
75689fb2
LB
3709 int r;
3710
3711 assert(c);
3712 assert(p);
75689fb2
LB
3713 assert(n_fds);
3714
3715 LIST_FOREACH(open_files, of, p->open_files) {
3716 _cleanup_close_ int fd = -EBADF;
3717
3718 fd = get_open_file_fd(c, p, of);
3719 if (fd < 0) {
3720 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3721 log_exec_debug_errno(c, p, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
3722 continue;
3723 }
3724
3725 return fd;
3726 }
3727
1eeaa93d 3728 if (!GREEDY_REALLOC(p->fds, *n_fds + 1))
75689fb2
LB
3729 return -ENOMEM;
3730
1eeaa93d 3731 r = strv_extend(&p->fd_names, of->fdname);
75689fb2
LB
3732 if (r < 0)
3733 return r;
3734
1eeaa93d 3735 p->fds[*n_fds] = TAKE_FD(fd);
75689fb2
LB
3736
3737 (*n_fds)++;
3738 }
3739
3740 return 0;
3741}
3742
3743static void log_command_line(
3744 const ExecContext *context,
3745 const ExecParameters *params,
3746 const char *msg,
3747 const char *executable,
3748 char **argv) {
3749
3750 assert(context);
3751 assert(params);
3752 assert(msg);
3753 assert(executable);
3754
3755 if (!DEBUG_LOGGING)
3756 return;
3757
3758 _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3759
3760 log_exec_struct(context, params, LOG_DEBUG,
3761 "EXECUTABLE=%s", executable,
3762 LOG_EXEC_MESSAGE(params, "%s: %s", msg, strnull(cmdline)),
3763 LOG_EXEC_INVOCATION_ID(params));
3764}
3765
3766static bool exec_context_need_unprivileged_private_users(
3767 const ExecContext *context,
3768 const ExecParameters *params) {
3769
3770 assert(context);
3771 assert(params);
3772
3773 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3774 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3775 * (system manager) then we have privileges and don't need this. */
3776 if (params->runtime_scope != RUNTIME_SCOPE_USER)
3777 return false;
3778
3779 return context->private_users ||
3780 context->private_tmp ||
3781 context->private_devices ||
3782 context->private_network ||
3783 context->network_namespace_path ||
3784 context->private_ipc ||
3785 context->ipc_namespace_path ||
3786 context->private_mounts > 0 ||
3787 context->mount_apivfs ||
3788 context->n_bind_mounts > 0 ||
3789 context->n_temporary_filesystems > 0 ||
3790 context->root_directory ||
3791 !strv_isempty(context->extension_directories) ||
3792 context->protect_system != PROTECT_SYSTEM_NO ||
3793 context->protect_home != PROTECT_HOME_NO ||
3794 context->protect_kernel_tunables ||
3795 context->protect_kernel_modules ||
3796 context->protect_kernel_logs ||
3797 context->protect_control_groups ||
3798 context->protect_clock ||
3799 context->protect_hostname ||
3800 !strv_isempty(context->read_write_paths) ||
3801 !strv_isempty(context->read_only_paths) ||
3802 !strv_isempty(context->inaccessible_paths) ||
3803 !strv_isempty(context->exec_paths) ||
3804 !strv_isempty(context->no_exec_paths);
3805}
3806
3807static bool exec_context_shall_confirm_spawn(const ExecContext *context) {
3808 assert(context);
3809
3810 if (confirm_spawn_disabled())
3811 return false;
3812
3813 /* For some reasons units remaining in the same process group
3814 * as PID 1 fail to acquire the console even if it's not used
3815 * by any process. So skip the confirmation question for them. */
3816 return !context->same_pgrp;
3817}
3818
3819static int exec_context_named_iofds(
3820 const ExecContext *c,
3821 const ExecParameters *p,
3822 int named_iofds[static 3]) {
3823
3824 size_t targets;
3825 const char* stdio_fdname[3];
3826 size_t n_fds;
3827
3828 assert(c);
3829 assert(p);
3830 assert(named_iofds);
3831
3832 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3833 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3834 (c->std_error == EXEC_OUTPUT_NAMED_FD);
3835
3836 for (size_t i = 0; i < 3; i++)
3837 stdio_fdname[i] = exec_context_fdname(c, i);
3838
3839 n_fds = p->n_storage_fds + p->n_socket_fds;
3840
3841 for (size_t i = 0; i < n_fds && targets > 0; i++)
3842 if (named_iofds[STDIN_FILENO] < 0 &&
3843 c->std_input == EXEC_INPUT_NAMED_FD &&
3844 stdio_fdname[STDIN_FILENO] &&
3845 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3846
3847 named_iofds[STDIN_FILENO] = p->fds[i];
3848 targets--;
3849
3850 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3851 c->std_output == EXEC_OUTPUT_NAMED_FD &&
3852 stdio_fdname[STDOUT_FILENO] &&
3853 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3854
3855 named_iofds[STDOUT_FILENO] = p->fds[i];
3856 targets--;
3857
3858 } else if (named_iofds[STDERR_FILENO] < 0 &&
3859 c->std_error == EXEC_OUTPUT_NAMED_FD &&
3860 stdio_fdname[STDERR_FILENO] &&
3861 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3862
3863 named_iofds[STDERR_FILENO] = p->fds[i];
3864 targets--;
3865 }
3866
3867 return targets == 0 ? 0 : -ENOENT;
3868}
3869
7b6d3dcd
LB
3870static void exec_shared_runtime_close(ExecSharedRuntime *shared) {
3871 if (!shared)
3872 return;
3873
3874 safe_close_pair(shared->netns_storage_socket);
3875 safe_close_pair(shared->ipcns_storage_socket);
3876}
3877
3878static void exec_runtime_close(ExecRuntime *rt) {
3879 if (!rt)
3880 return;
3881
3882 safe_close_pair(rt->ephemeral_storage_socket);
3883
3884 exec_shared_runtime_close(rt->shared);
3885 dynamic_creds_close(rt->dynamic_creds);
3886}
3887
3888static void exec_params_close(ExecParameters *p) {
3889 if (!p)
3890 return;
3891
3892 p->stdin_fd = safe_close(p->stdin_fd);
3893 p->stdout_fd = safe_close(p->stdout_fd);
3894 p->stderr_fd = safe_close(p->stderr_fd);
3895}
3896
75689fb2
LB
3897int exec_invoke(
3898 const ExecCommand *command,
3899 const ExecContext *context,
3900 ExecParameters *params,
3901 ExecRuntime *runtime,
3902 const CGroupContext *cgroup_context,
3903 int *exit_status) {
3904
3905 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
5a5fdfe3 3906 int r, ngids = 0;
75689fb2
LB
3907 _cleanup_free_ gid_t *supplementary_gids = NULL;
3908 const char *username = NULL, *groupname = NULL;
3909 _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
3910 const char *home = NULL, *shell = NULL;
3911 char **final_argv = NULL;
3912 dev_t journal_stream_dev = 0;
3913 ino_t journal_stream_ino = 0;
3914 bool userns_set_up = false;
3915 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3916 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3917 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3918 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
24832d10 3919 bool keep_seccomp_privileges = false;
75689fb2
LB
3920#if HAVE_SELINUX
3921 _cleanup_free_ char *mac_selinux_context_net = NULL;
3922 bool use_selinux = false;
3923#endif
3924#if ENABLE_SMACK
3925 bool use_smack = false;
3926#endif
3927#if HAVE_APPARMOR
3928 bool use_apparmor = false;
24832d10
ILG
3929#endif
3930#if HAVE_SECCOMP
3931 uint64_t saved_bset = 0;
75689fb2
LB
3932#endif
3933 uid_t saved_uid = getuid();
3934 gid_t saved_gid = getgid();
3935 uid_t uid = UID_INVALID;
3936 gid_t gid = GID_INVALID;
3937 size_t n_fds, /* fds to pass to the child */
3938 n_keep_fds; /* total number of fds not to close */
3939 int secure_bits;
3940 _cleanup_free_ gid_t *gids_after_pam = NULL;
3941 int ngids_after_pam = 0;
75689fb2 3942
1eeaa93d
LB
3943 int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
3944 size_t n_storage_fds, n_socket_fds;
75689fb2
LB
3945
3946 assert(command);
3947 assert(context);
3948 assert(params);
3949 assert(exit_status);
3950
cc9f4cad
LB
3951 if (context->log_level_max >= 0)
3952 log_set_max_level(context->log_level_max);
3953
75689fb2 3954 /* Explicitly test for CVE-2021-4034 inspired invocations */
856bed0a
LB
3955 if (!command->path || strv_isempty(command->argv)) {
3956 *exit_status = EXIT_EXEC;
3957 return log_exec_error_errno(
3958 context,
3959 params,
3960 SYNTHETIC_ERRNO(EINVAL),
3961 "Invalid command line arguments.");
3962 }
75689fb2
LB
3963
3964 LOG_CONTEXT_PUSH_EXEC(context, params);
3965
3966 if (context->std_input == EXEC_INPUT_SOCKET ||
3967 context->std_output == EXEC_OUTPUT_SOCKET ||
3968 context->std_error == EXEC_OUTPUT_SOCKET) {
3969
3970 if (params->n_socket_fds > 1)
3971 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
3972
3973 if (params->n_socket_fds == 0)
3974 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
3975
3976 socket_fd = params->fds[0];
1eeaa93d 3977 n_storage_fds = n_socket_fds = 0;
75689fb2 3978 } else {
75689fb2
LB
3979 n_socket_fds = params->n_socket_fds;
3980 n_storage_fds = params->n_storage_fds;
3981 }
3982 n_fds = n_socket_fds + n_storage_fds;
3983
3984 r = exec_context_named_iofds(context, params, named_iofds);
3985 if (r < 0)
3986 return log_exec_error_errno(context, params, r, "Failed to load a named file descriptor: %m");
3987
3988 rename_process_from_path(command->path);
3989
3990 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3991 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3992 * both of which will be demoted to SIG_DFL. */
3993 (void) default_signals(SIGNALS_CRASH_HANDLER,
3994 SIGNALS_IGNORE);
3995
3996 if (context->ignore_sigpipe)
3997 (void) ignore_signals(SIGPIPE);
3998
3999 r = reset_signal_mask();
4000 if (r < 0) {
4001 *exit_status = EXIT_SIGNAL_MASK;
4002 return log_exec_error_errno(context, params, r, "Failed to set process signal mask: %m");
4003 }
4004
4005 if (params->idle_pipe)
4006 do_idle_pipe_dance(params->idle_pipe);
4007
4008 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4009 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4010 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4011 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4012
4013 log_forget_fds();
4014 log_set_open_when_needed(true);
4015 log_settle_target();
75689fb2
LB
4016
4017 /* In case anything used libc syslog(), close this here, too */
4018 closelog();
4019
1eeaa93d 4020 r = collect_open_file_fds(context, params, &n_fds);
75689fb2
LB
4021 if (r < 0) {
4022 *exit_status = EXIT_FDS;
4023 return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m");
4024 }
4025
4026 int keep_fds[n_fds + 3];
1eeaa93d 4027 memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int));
75689fb2
LB
4028 n_keep_fds = n_fds;
4029
5a5fdfe3 4030 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->exec_fd);
75689fb2
LB
4031 if (r < 0) {
4032 *exit_status = EXIT_FDS;
5a5fdfe3 4033 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
75689fb2
LB
4034 }
4035
4036#if HAVE_LIBBPF
5a5fdfe3
MY
4037 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->bpf_outer_map_fd);
4038 if (r < 0) {
4039 *exit_status = EXIT_FDS;
4040 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
75689fb2
LB
4041 }
4042#endif
4043
4044 r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds);
4045 if (r < 0) {
4046 *exit_status = EXIT_FDS;
4047 return log_exec_error_errno(context, params, r, "Failed to close unwanted file descriptors: %m");
4048 }
4049
4050 if (!context->same_pgrp &&
4051 setsid() < 0) {
4052 *exit_status = EXIT_SETSID;
4053 return log_exec_error_errno(context, params, errno, "Failed to create new process session: %m");
4054 }
4055
4056 exec_context_tty_reset(context, params);
4057
4058 if (params->shall_confirm_spawn && exec_context_shall_confirm_spawn(context)) {
4059 _cleanup_free_ char *cmdline = NULL;
4060
4061 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4062 if (!cmdline) {
4063 *exit_status = EXIT_MEMORY;
4064 return log_oom();
4065 }
4066
4067 r = ask_for_confirmation(context, params, cmdline);
4068 if (r != CONFIRM_EXECUTE) {
4069 if (r == CONFIRM_PRETEND_SUCCESS) {
4070 *exit_status = EXIT_SUCCESS;
4071 return 0;
4072 }
4073
4074 *exit_status = EXIT_CONFIRM;
4075 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ECANCELED),
4076 "Execution cancelled by the user");
4077 }
4078 }
4079
4080 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4081 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4082 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4083 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4084 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4085 if (setenv("SYSTEMD_ACTIVATION_UNIT", params->unit_id, true) != 0 ||
4086 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4087 *exit_status = EXIT_MEMORY;
4088 return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4089 }
4090
4091 if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4092 _cleanup_strv_free_ char **suggested_paths = NULL;
4093
4094 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4095 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4096 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4097 *exit_status = EXIT_USER;
4098 return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4099 }
4100
4101 r = compile_suggested_paths(context, params, &suggested_paths);
4102 if (r < 0) {
4103 *exit_status = EXIT_MEMORY;
4104 return log_oom();
4105 }
4106
4107 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4108 if (r < 0) {
4109 *exit_status = EXIT_USER;
4110 if (r == -EILSEQ)
4111 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4112 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4113 return log_exec_error_errno(context, params, r, "Failed to update dynamic user credentials: %m");
4114 }
4115
4116 if (!uid_is_valid(uid)) {
4117 *exit_status = EXIT_USER;
4118 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4119 }
4120
4121 if (!gid_is_valid(gid)) {
4122 *exit_status = EXIT_USER;
4123 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4124 }
4125
4126 if (runtime->dynamic_creds->user)
4127 username = runtime->dynamic_creds->user->name;
4128
4129 } else {
4130 if (context->user) {
4131 r = get_fixed_user(context->user, &username, &uid, &gid, &home, &shell);
4132 if (r < 0) {
4133 *exit_status = EXIT_USER;
4134 return log_exec_error_errno(context, params, r, "Failed to determine user credentials: %m");
4135 }
4136 }
4137
4138 if (context->group) {
4139 r = get_fixed_group(context->group, &groupname, &gid);
4140 if (r < 0) {
4141 *exit_status = EXIT_GROUP;
4142 return log_exec_error_errno(context, params, r, "Failed to determine group credentials: %m");
4143 }
4144 }
4145 }
4146
4147 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4148 r = get_supplementary_groups(context, username, groupname, gid,
4149 &supplementary_gids, &ngids);
4150 if (r < 0) {
4151 *exit_status = EXIT_GROUP;
4152 return log_exec_error_errno(context, params, r, "Failed to determine supplementary groups: %m");
4153 }
4154
4155 r = send_user_lookup(params->unit_id, params->user_lookup_fd, uid, gid);
4156 if (r < 0) {
4157 *exit_status = EXIT_USER;
4158 return log_exec_error_errno(context, params, r, "Failed to send user credentials to PID1: %m");
4159 }
4160
4161 params->user_lookup_fd = safe_close(params->user_lookup_fd);
4162
4163 r = acquire_home(context, uid, &home, &home_buffer);
4164 if (r < 0) {
4165 *exit_status = EXIT_CHDIR;
4166 return log_exec_error_errno(context, params, r, "Failed to determine $HOME for user: %m");
4167 }
4168
4169 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4170 if (socket_fd >= 0)
4171 (void) fd_nonblock(socket_fd, false);
4172
4173 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4174 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4175 if (params->cgroup_path) {
4176 _cleanup_free_ char *p = NULL;
4177
4178 r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4179 if (r < 0) {
4180 *exit_status = EXIT_CGROUP;
4181 return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4182 }
4183
4184 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4185 if (r == -EUCLEAN) {
4186 *exit_status = EXIT_CGROUP;
4187 return log_exec_error_errno(context, params, r, "Failed to attach process to cgroup %s "
4188 "because the cgroup or one of its parents or "
4189 "siblings is in the threaded mode: %m", p);
4190 }
4191 if (r < 0) {
4192 *exit_status = EXIT_CGROUP;
4193 return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", p);
4194 }
4195 }
4196
4197 if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4198 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4199 if (r < 0) {
4200 *exit_status = EXIT_NETWORK;
4201 return log_exec_error_errno(context, params, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4202 }
4203 }
4204
4205 if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4206 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4207 if (r < 0) {
4208 *exit_status = EXIT_NAMESPACE;
4209 return log_exec_error_errno(context, params, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4210 }
4211 }
4212
4213 r = setup_input(context, params, socket_fd, named_iofds);
4214 if (r < 0) {
4215 *exit_status = EXIT_STDIN;
4216 return log_exec_error_errno(context, params, r, "Failed to set up standard input: %m");
4217 }
4218
4219 r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4220 if (r < 0) {
4221 *exit_status = EXIT_STDOUT;
4222 return log_exec_error_errno(context, params, r, "Failed to set up standard output: %m");
4223 }
4224
4225 r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4226 if (r < 0) {
4227 *exit_status = EXIT_STDERR;
4228 return log_exec_error_errno(context, params, r, "Failed to set up standard error output: %m");
4229 }
4230
4231 if (context->oom_score_adjust_set) {
4232 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4233 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4234 r = set_oom_score_adjust(context->oom_score_adjust);
4235 if (ERRNO_IS_NEG_PRIVILEGE(r))
4236 log_exec_debug_errno(context, params, r,
4237 "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4238 else if (r < 0) {
4239 *exit_status = EXIT_OOM_ADJUST;
4240 return log_exec_error_errno(context, params, r, "Failed to adjust OOM setting: %m");
4241 }
4242 }
4243
4244 if (context->coredump_filter_set) {
4245 r = set_coredump_filter(context->coredump_filter);
4246 if (ERRNO_IS_NEG_PRIVILEGE(r))
4247 log_exec_debug_errno(context, params, r, "Failed to adjust coredump_filter, ignoring: %m");
4248 else if (r < 0) {
4249 *exit_status = EXIT_LIMITS;
4250 return log_exec_error_errno(context, params, r, "Failed to adjust coredump_filter: %m");
4251 }
4252 }
4253
4254 if (context->nice_set) {
4255 r = setpriority_closest(context->nice);
4256 if (r < 0) {
4257 *exit_status = EXIT_NICE;
4258 return log_exec_error_errno(context, params, r, "Failed to set up process scheduling priority (nice level): %m");
4259 }
4260 }
4261
4262 if (context->cpu_sched_set) {
4263 struct sched_param param = {
4264 .sched_priority = context->cpu_sched_priority,
4265 };
4266
4267 r = sched_setscheduler(0,
4268 context->cpu_sched_policy |
4269 (context->cpu_sched_reset_on_fork ?
4270 SCHED_RESET_ON_FORK : 0),
4271 &param);
4272 if (r < 0) {
4273 *exit_status = EXIT_SETSCHEDULER;
4274 return log_exec_error_errno(context, params, errno, "Failed to set up CPU scheduling: %m");
4275 }
4276 }
4277
4278 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4279 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4280 const CPUSet *cpu_set;
4281
4282 if (context->cpu_affinity_from_numa) {
4283 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4284 if (r < 0) {
4285 *exit_status = EXIT_CPUAFFINITY;
4286 return log_exec_error_errno(context, params, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4287 }
4288
4289 cpu_set = &converted_cpu_set;
4290 } else
4291 cpu_set = &context->cpu_set;
4292
4293 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4294 *exit_status = EXIT_CPUAFFINITY;
4295 return log_exec_error_errno(context, params, errno, "Failed to set up CPU affinity: %m");
4296 }
4297 }
4298
4299 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4300 r = apply_numa_policy(&context->numa_policy);
4301 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4302 log_exec_debug_errno(context, params, r, "NUMA support not available, ignoring.");
4303 else if (r < 0) {
4304 *exit_status = EXIT_NUMA_POLICY;
4305 return log_exec_error_errno(context, params, r, "Failed to set NUMA memory policy: %m");
4306 }
4307 }
4308
4309 if (context->ioprio_set)
4310 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4311 *exit_status = EXIT_IOPRIO;
4312 return log_exec_error_errno(context, params, errno, "Failed to set up IO scheduling priority: %m");
4313 }
4314
4315 if (context->timer_slack_nsec != NSEC_INFINITY)
4316 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4317 *exit_status = EXIT_TIMERSLACK;
4318 return log_exec_error_errno(context, params, errno, "Failed to set up timer slack: %m");
4319 }
4320
4321 if (context->personality != PERSONALITY_INVALID) {
4322 r = safe_personality(context->personality);
4323 if (r < 0) {
4324 *exit_status = EXIT_PERSONALITY;
4325 return log_exec_error_errno(context, params, r, "Failed to set up execution domain (personality): %m");
4326 }
4327 }
4328
de3612db 4329#if ENABLE_UTMP
75689fb2
LB
4330 if (context->utmp_id) {
4331 const char *line = context->tty_path ?
4332 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4333 NULL;
4334 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4335 line,
4336 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4337 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4338 USER_PROCESS,
4339 username);
4340 }
de3612db 4341#endif
75689fb2
LB
4342
4343 if (uid_is_valid(uid)) {
4344 r = chown_terminal(STDIN_FILENO, uid);
4345 if (r < 0) {
4346 *exit_status = EXIT_STDIN;
4347 return log_exec_error_errno(context, params, r, "Failed to change ownership of terminal: %m");
4348 }
4349 }
4350
4351 if (params->cgroup_path) {
4352 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4353 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4354 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4355 * touch a single hierarchy too. */
4356
4357 if (params->flags & EXEC_CGROUP_DELEGATE) {
4358 _cleanup_free_ char *p = NULL;
4359
4360 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4361 if (r < 0) {
4362 *exit_status = EXIT_CGROUP;
4363 return log_exec_error_errno(context, params, r, "Failed to adjust control group access: %m");
4364 }
4365
4366 r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4367 if (r < 0) {
4368 *exit_status = EXIT_CGROUP;
4369 return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4370 }
4371 if (r > 0) {
4372 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
4373 if (r < 0) {
4374 *exit_status = EXIT_CGROUP;
4375 return log_exec_error_errno(context, params, r, "Failed to adjust control subgroup access: %m");
4376 }
4377 }
4378 }
4379
4380 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4381 if (cgroup_context_want_memory_pressure(cgroup_context)) {
4382 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4383 if (r < 0) {
4384 *exit_status = EXIT_MEMORY;
4385 return log_oom();
4386 }
4387
4388 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4389 if (r < 0) {
4390 log_exec_full_errno(context, params, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4391 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4392 memory_pressure_path = mfree(memory_pressure_path);
4393 }
4394 } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4395 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning off memory pressure watch */
4396 if (!memory_pressure_path) {
4397 *exit_status = EXIT_MEMORY;
4398 return log_oom();
4399 }
4400 }
4401 }
4402 }
4403
4404 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4405
4406 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4407 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4408 if (r < 0)
4409 return log_exec_error_errno(context, params, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4410 }
4411
4412 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4413 r = exec_setup_credentials(context, params, params->unit_id, uid, gid);
4414 if (r < 0) {
4415 *exit_status = EXIT_CREDENTIALS;
4416 return log_exec_error_errno(context, params, r, "Failed to set up credentials: %m");
4417 }
4418 }
4419
4420 r = build_environment(
4421 context,
4422 params,
4423 cgroup_context,
4424 n_fds,
75689fb2
LB
4425 home,
4426 username,
4427 shell,
4428 journal_stream_dev,
4429 journal_stream_ino,
4430 memory_pressure_path,
4431 &our_env);
4432 if (r < 0) {
4433 *exit_status = EXIT_MEMORY;
4434 return log_oom();
4435 }
4436
4437 r = build_pass_environment(context, &pass_env);
4438 if (r < 0) {
4439 *exit_status = EXIT_MEMORY;
4440 return log_oom();
4441 }
4442
4443 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4444 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4445 * not specify PATH but the unit has ExecSearchPath. */
4446 if (!strv_isempty(context->exec_search_path)) {
4447 _cleanup_free_ char *joined = NULL;
4448
4449 joined = strv_join(context->exec_search_path, ":");
4450 if (!joined) {
4451 *exit_status = EXIT_MEMORY;
4452 return log_oom();
4453 }
4454
4455 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4456 if (r < 0) {
4457 *exit_status = EXIT_MEMORY;
4458 return log_oom();
4459 }
4460 }
4461
4462 accum_env = strv_env_merge(params->environment,
4463 our_env,
4464 joined_exec_search_path,
4465 pass_env,
4466 context->environment,
4467 params->files_env);
4468 if (!accum_env) {
4469 *exit_status = EXIT_MEMORY;
4470 return log_oom();
4471 }
4472 accum_env = strv_env_clean(accum_env);
4473
4474 (void) umask(context->umask);
4475
4476 r = setup_keyring(context, params, uid, gid);
4477 if (r < 0) {
4478 *exit_status = EXIT_KEYRING;
4479 return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m");
4480 }
4481
4482 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4483 * from it. */
4484 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4485
4486 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4487 * for it, and the kernel doesn't actually support ambient caps. */
4488 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4489
4490 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4491 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4492 * desired. */
4493 if (needs_ambient_hack)
4494 needs_setuid = false;
4495 else
4496 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4497
4498 uint64_t capability_ambient_set = context->capability_ambient_set;
4499
4500 if (needs_sandboxing) {
4501 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4502 * /sys being present. The actual MAC context application will happen later, as late as
4503 * possible, to avoid impacting our own code paths. */
4504
4505#if HAVE_SELINUX
4506 use_selinux = mac_selinux_use();
4507#endif
4508#if ENABLE_SMACK
4509 use_smack = mac_smack_use();
4510#endif
4511#if HAVE_APPARMOR
4512 use_apparmor = mac_apparmor_use();
4513#endif
4514 }
4515
4516 if (needs_sandboxing) {
4517 int which_failed;
4518
4519 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4520 * is set here. (See below.) */
4521
4522 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4523 if (r < 0) {
4524 *exit_status = EXIT_LIMITS;
4525 return log_exec_error_errno(context, params, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4526 }
4527 }
4528
4529 if (needs_setuid && context->pam_name && username) {
6634e66d 4530 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
75689fb2
LB
4531 * wins here. (See above.) */
4532
4533 /* All fds passed in the fds array will be closed in the pam child process. */
1eeaa93d 4534 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, params->fds, n_fds);
75689fb2
LB
4535 if (r < 0) {
4536 *exit_status = EXIT_PAM;
4537 return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
4538 }
4539
4540 if (ambient_capabilities_supported()) {
4541 uint64_t ambient_after_pam;
4542
4543 /* PAM modules might have set some ambient caps. Query them here and merge them into
4544 * the caps we want to set in the end, so that we don't end up unsetting them. */
4545 r = capability_get_ambient(&ambient_after_pam);
4546 if (r < 0) {
4547 *exit_status = EXIT_CAPABILITIES;
4548 return log_exec_error_errno(context, params, r, "Failed to query ambient caps: %m");
4549 }
4550
4551 capability_ambient_set |= ambient_after_pam;
4552 }
4553
4554 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4555 if (ngids_after_pam < 0) {
dbc0342e 4556 *exit_status = EXIT_GROUP;
75689fb2
LB
4557 return log_exec_error_errno(context, params, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4558 }
4559 }
4560
4561 if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
4562 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4563 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
6634e66d 4564 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
75689fb2
LB
4565
4566 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4567 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4568 * the actual requested operations fail (or silently continue). */
4569 if (r < 0 && context->private_users) {
4570 *exit_status = EXIT_USER;
4571 return log_exec_error_errno(context, params, r, "Failed to set up user namespacing for unprivileged user: %m");
4572 }
4573 if (r < 0)
4574 log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4575 else
4576 userns_set_up = true;
4577 }
4578
4579 if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4580
4581 /* Try to enable network namespacing if network namespacing is available and we have
4582 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4583 * new network namespace. And if we don't have that, then we could only create a network
4584 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4585 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
4586 r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
4587 if (ERRNO_IS_NEG_PRIVILEGE(r))
4588 log_exec_notice_errno(context, params, r,
4589 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4590 else if (r < 0) {
4591 *exit_status = EXIT_NETWORK;
4592 return log_exec_error_errno(context, params, r, "Failed to set up network namespacing: %m");
4593 }
4594 } else if (context->network_namespace_path) {
4595 *exit_status = EXIT_NETWORK;
4596 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4597 "NetworkNamespacePath= is not supported, refusing.");
4598 } else
4599 log_exec_notice(context, params, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4600 }
4601
4602 if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4603
4604 if (ns_type_supported(NAMESPACE_IPC)) {
4605 r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4606 if (r == -EPERM)
4607 log_exec_warning_errno(context, params, r,
4608 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4609 else if (r < 0) {
4610 *exit_status = EXIT_NAMESPACE;
4611 return log_exec_error_errno(context, params, r, "Failed to set up IPC namespacing: %m");
4612 }
4613 } else if (context->ipc_namespace_path) {
4614 *exit_status = EXIT_NAMESPACE;
4615 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4616 "IPCNamespacePath= is not supported, refusing.");
4617 } else
4618 log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4619 }
4620
4621 if (needs_mount_namespace) {
4622 _cleanup_free_ char *error_path = NULL;
4623
4624 r = apply_mount_namespace(command->flags, context, params, runtime, memory_pressure_path, &error_path);
4625 if (r < 0) {
4626 *exit_status = EXIT_NAMESPACE;
4627 return log_exec_error_errno(context, params, r, "Failed to set up mount namespacing%s%s: %m",
4628 error_path ? ": " : "", strempty(error_path));
4629 }
4630 }
4631
4632 if (needs_sandboxing) {
4633 r = apply_protect_hostname(context, params, exit_status);
4634 if (r < 0)
4635 return r;
4636 }
4637
4638 if (context->memory_ksm >= 0)
4639 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
4640 if (ERRNO_IS_NOT_SUPPORTED(errno))
4641 log_exec_debug_errno(context,
4642 params,
4643 errno,
4644 "KSM support not available, ignoring.");
4645 else {
4646 *exit_status = EXIT_KSM;
4647 return log_exec_error_errno(context, params, errno, "Failed to set KSM: %m");
4648 }
4649 }
4650
4651 /* Drop groups as early as possible.
6634e66d 4652 * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root.
75689fb2
LB
4653 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4654 if (needs_setuid) {
4655 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4656 int ngids_to_enforce = 0;
4657
4658 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4659 ngids,
4660 gids_after_pam,
4661 ngids_after_pam,
4662 &gids_to_enforce);
4663 if (ngids_to_enforce < 0) {
dbc0342e 4664 *exit_status = EXIT_GROUP;
75689fb2
LB
4665 return log_exec_error_errno(context, params,
4666 ngids_to_enforce,
4667 "Failed to merge group lists. Group membership might be incorrect: %m");
4668 }
4669
4670 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4671 if (r < 0) {
4672 *exit_status = EXIT_GROUP;
4673 return log_exec_error_errno(context, params, r, "Changing group credentials failed: %m");
4674 }
4675 }
4676
4677 /* If the user namespace was not set up above, try to do it now.
4678 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4679 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4680 * case of mount namespaces being less privileged when the mount point list is copied from a
4681 * different user namespace). */
4682
4683 if (needs_sandboxing && context->private_users && !userns_set_up) {
4684 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4685 if (r < 0) {
4686 *exit_status = EXIT_USER;
4687 return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m");
4688 }
4689 }
4690
4691 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4692 * shall execute. */
4693
4694 _cleanup_free_ char *executable = NULL;
4695 _cleanup_close_ int executable_fd = -EBADF;
4696 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4697 if (r < 0) {
4698 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4699 log_exec_struct_errno(context, params, LOG_INFO, r,
4700 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4701 LOG_EXEC_INVOCATION_ID(params),
4702 LOG_EXEC_MESSAGE(params,
4703 "Executable %s missing, skipping: %m",
4704 command->path),
4705 "EXECUTABLE=%s", command->path);
4706 *exit_status = EXIT_SUCCESS;
4707 return 0;
4708 }
4709
4710 *exit_status = EXIT_EXEC;
4711 return log_exec_struct_errno(context, params, LOG_INFO, r,
4712 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4713 LOG_EXEC_INVOCATION_ID(params),
4714 LOG_EXEC_MESSAGE(params,
4715 "Failed to locate executable %s: %m",
4716 command->path),
4717 "EXECUTABLE=%s", command->path);
4718 }
4719
5a5fdfe3 4720 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &executable_fd);
75689fb2
LB
4721 if (r < 0) {
4722 *exit_status = EXIT_FDS;
5a5fdfe3 4723 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
75689fb2
LB
4724 }
4725
4726#if HAVE_SELINUX
4727 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4728 int fd = -EBADF;
4729
4730 if (socket_fd >= 0)
4731 fd = socket_fd;
4732 else if (params->n_socket_fds == 1)
4733 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4734 * use context from that fd to compute the label. */
4735 fd = params->fds[0];
4736
4737 if (fd >= 0) {
4738 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4739 if (r < 0) {
4740 if (!context->selinux_context_ignore) {
4741 *exit_status = EXIT_SELINUX_CONTEXT;
4742 return log_exec_error_errno(context,
4743 params,
4744 r,
4745 "Failed to determine SELinux context: %m");
4746 }
4747 log_exec_debug_errno(context,
4748 params,
4749 r,
4750 "Failed to determine SELinux context, ignoring: %m");
4751 }
4752 }
4753 }
4754#endif
4755
4756 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4757 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4758 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
7b6d3dcd
LB
4759 * execve(). But first, close the remaining sockets in the context objects. */
4760
4761 exec_runtime_close(runtime);
4762 exec_params_close(params);
75689fb2
LB
4763
4764 r = close_all_fds(keep_fds, n_keep_fds);
4765 if (r >= 0)
1eeaa93d 4766 r = shift_fds(params->fds, n_fds);
75689fb2 4767 if (r >= 0)
1eeaa93d 4768 r = flag_fds(params->fds, n_socket_fds, n_fds, context->non_blocking);
75689fb2
LB
4769 if (r < 0) {
4770 *exit_status = EXIT_FDS;
4771 return log_exec_error_errno(context, params, r, "Failed to adjust passed file descriptors: %m");
4772 }
4773
4774 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4775 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4776 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4777 * came this far. */
4778
4779 secure_bits = context->secure_bits;
4780
4781 if (needs_sandboxing) {
4782 uint64_t bset;
4783
4784 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4785 * (Note this is placed after the general resource limit initialization, see above, in order
4786 * to take precedence.) */
4787 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4788 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4789 *exit_status = EXIT_LIMITS;
4790 return log_exec_error_errno(context, params, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4791 }
4792 }
4793
4794#if ENABLE_SMACK
4795 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4796 * process. This is the latest place before dropping capabilities. Other MAC contexts are set later. */
4797 if (use_smack && context->smack_process_label) {
4798 r = setup_smack(params, context, executable_fd);
4799 if (r < 0 && !context->smack_process_label_ignore) {
4800 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4801 return log_exec_error_errno(context, params, r, "Failed to set SMACK process label: %m");
4802 }
4803 }
4804#endif
4805
4806 bset = context->capability_bounding_set;
4807 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4808 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4809 * instead of us doing that */
4810 if (needs_ambient_hack)
4811 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4812 (UINT64_C(1) << CAP_SETUID) |
4813 (UINT64_C(1) << CAP_SETGID);
4814
24832d10
ILG
4815#if HAVE_SECCOMP
4816 /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
4817 * keep the needed privileges to apply it even if we're not root. */
4818 if (needs_setuid &&
4819 uid_is_valid(uid) &&
4820 context_has_seccomp(context) &&
4821 seccomp_allows_drop_privileges(context)) {
4822 keep_seccomp_privileges = true;
4823
4824 if (prctl(PR_SET_KEEPCAPS, 1) < 0) {
4825 *exit_status = EXIT_USER;
4826 return log_exec_error_errno(context, params, errno, "Failed to enable keep capabilities flag: %m");
4827 }
4828
4829 /* Save the current bounding set so we can restore it after applying the seccomp
4830 * filter */
4831 saved_bset = bset;
4832 bset |= (UINT64_C(1) << CAP_SYS_ADMIN) |
4833 (UINT64_C(1) << CAP_SETPCAP);
4834 }
4835#endif
4836
75689fb2
LB
4837 if (!cap_test_all(bset)) {
4838 r = capability_bounding_set_drop(bset, /* right_now= */ false);
4839 if (r < 0) {
4840 *exit_status = EXIT_CAPABILITIES;
4841 return log_exec_error_errno(context, params, r, "Failed to drop capabilities: %m");
4842 }
4843 }
4844
4845 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4846 * keep-caps set.
4847 *
4848 * To be able to raise the ambient capabilities after setresuid() they have to be added to
4849 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
4850 * the ambient capabilities can be raised as they are present in the permitted and
4851 * inheritable set. However it is possible that someone wants to set ambient capabilities
4852 * without changing the user, so we also set the ambient capabilities here.
4853 *
4854 * The requested ambient capabilities are raised in the inheritable set if the second
4855 * argument is true. */
4856 if (!needs_ambient_hack) {
4857 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
4858 if (r < 0) {
4859 *exit_status = EXIT_CAPABILITIES;
4860 return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (before UID change): %m");
4861 }
4862 }
4863 }
4864
4865 /* chroot to root directory first, before we lose the ability to chroot */
4866 r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
4867 if (r < 0)
4868 return log_exec_error_errno(context, params, r, "Chrooting to the requested root directory failed: %m");
4869
4870 if (needs_setuid) {
4871 if (uid_is_valid(uid)) {
4872 r = enforce_user(context, uid, capability_ambient_set);
4873 if (r < 0) {
4874 *exit_status = EXIT_USER;
4875 return log_exec_error_errno(context, params, r, "Failed to change UID to " UID_FMT ": %m", uid);
4876 }
4877
24832d10 4878 if (keep_seccomp_privileges) {
f4a35f2a
LB
4879 if (!FLAGS_SET(capability_ambient_set, (UINT64_C(1) << CAP_SETUID))) {
4880 r = drop_capability(CAP_SETUID);
4881 if (r < 0) {
4882 *exit_status = EXIT_USER;
4883 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETUID: %m");
4884 }
24832d10
ILG
4885 }
4886
4887 r = keep_capability(CAP_SYS_ADMIN);
4888 if (r < 0) {
4889 *exit_status = EXIT_USER;
4890 return log_exec_error_errno(context, params, r, "Failed to keep CAP_SYS_ADMIN: %m");
4891 }
4892
4893 r = keep_capability(CAP_SETPCAP);
4894 if (r < 0) {
4895 *exit_status = EXIT_USER;
4896 return log_exec_error_errno(context, params, r, "Failed to keep CAP_SETPCAP: %m");
4897 }
4898 }
4899
75689fb2
LB
4900 if (!needs_ambient_hack && capability_ambient_set != 0) {
4901
4902 /* Raise the ambient capabilities after user change. */
4903 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
4904 if (r < 0) {
4905 *exit_status = EXIT_CAPABILITIES;
4906 return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (after UID change): %m");
4907 }
4908 }
4909 }
4910 }
4911
4912 /* Apply working directory here, because the working directory might be on NFS and only the user running
4913 * this service might have the correct privilege to change to the working directory */
4914 r = apply_working_directory(context, params, runtime, home, exit_status);
4915 if (r < 0)
4916 return log_exec_error_errno(context, params, r, "Changing to the requested working directory failed: %m");
4917
4918 if (needs_sandboxing) {
4919 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4920 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4921 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4922 * are restricted. */
4923
4924#if HAVE_SELINUX
4925 if (use_selinux) {
4926 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4927
4928 if (exec_context) {
4929 r = setexeccon(exec_context);
4930 if (r < 0) {
4931 if (!context->selinux_context_ignore) {
4932 *exit_status = EXIT_SELINUX_CONTEXT;
4933 return log_exec_error_errno(context, params, r, "Failed to change SELinux context to %s: %m", exec_context);
4934 }
4935 log_exec_debug_errno(context,
4936 params,
4937 r,
4938 "Failed to change SELinux context to %s, ignoring: %m",
4939 exec_context);
4940 }
4941 }
4942 }
4943#endif
4944
4945#if HAVE_APPARMOR
4946 if (use_apparmor && context->apparmor_profile) {
4947 r = aa_change_onexec(context->apparmor_profile);
4948 if (r < 0 && !context->apparmor_profile_ignore) {
4949 *exit_status = EXIT_APPARMOR_PROFILE;
4950 return log_exec_error_errno(context,
4951 params,
4952 errno,
4953 "Failed to prepare AppArmor profile change to %s: %m",
4954 context->apparmor_profile);
4955 }
4956 }
4957#endif
4958
4959 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
4960 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
4961 * requires CAP_SETPCAP. */
4962 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4963 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4964 * effective set here.
4965 *
4966 * The effective set is overwritten during execve() with the following values:
4967 *
4968 * - ambient set (for non-root processes)
4969 *
4970 * - (inheritable | bounding) set (for root processes)
4971 *
4972 * Hence there is no security impact in raising it in the effective set before execve().
4973 */
4974 r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
4975 if (r < 0) {
4976 *exit_status = EXIT_CAPABILITIES;
4977 return log_exec_error_errno(context, params, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4978 }
4979 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4980 *exit_status = EXIT_SECUREBITS;
4981 return log_exec_error_errno(context, params, errno, "Failed to set process secure bits: %m");
4982 }
4983 }
4984
4985 if (context_has_no_new_privileges(context))
4986 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4987 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4988 return log_exec_error_errno(context, params, errno, "Failed to disable new privileges: %m");
4989 }
4990
4991#if HAVE_SECCOMP
4992 r = apply_address_families(context, params);
4993 if (r < 0) {
4994 *exit_status = EXIT_ADDRESS_FAMILIES;
4995 return log_exec_error_errno(context, params, r, "Failed to restrict address families: %m");
4996 }
4997
4998 r = apply_memory_deny_write_execute(context, params);
4999 if (r < 0) {
5000 *exit_status = EXIT_SECCOMP;
5001 return log_exec_error_errno(context, params, r, "Failed to disable writing to executable memory: %m");
5002 }
5003
5004 r = apply_restrict_realtime(context, params);
5005 if (r < 0) {
5006 *exit_status = EXIT_SECCOMP;
5007 return log_exec_error_errno(context, params, r, "Failed to apply realtime restrictions: %m");
5008 }
5009
5010 r = apply_restrict_suid_sgid(context, params);
5011 if (r < 0) {
5012 *exit_status = EXIT_SECCOMP;
5013 return log_exec_error_errno(context, params, r, "Failed to apply SUID/SGID restrictions: %m");
5014 }
5015
5016 r = apply_restrict_namespaces(context, params);
5017 if (r < 0) {
5018 *exit_status = EXIT_SECCOMP;
5019 return log_exec_error_errno(context, params, r, "Failed to apply namespace restrictions: %m");
5020 }
5021
5022 r = apply_protect_sysctl(context, params);
5023 if (r < 0) {
5024 *exit_status = EXIT_SECCOMP;
5025 return log_exec_error_errno(context, params, r, "Failed to apply sysctl restrictions: %m");
5026 }
5027
5028 r = apply_protect_kernel_modules(context, params);
5029 if (r < 0) {
5030 *exit_status = EXIT_SECCOMP;
5031 return log_exec_error_errno(context, params, r, "Failed to apply module loading restrictions: %m");
5032 }
5033
5034 r = apply_protect_kernel_logs(context, params);
5035 if (r < 0) {
5036 *exit_status = EXIT_SECCOMP;
5037 return log_exec_error_errno(context, params, r, "Failed to apply kernel log restrictions: %m");
5038 }
5039
5040 r = apply_protect_clock(context, params);
5041 if (r < 0) {
5042 *exit_status = EXIT_SECCOMP;
5043 return log_exec_error_errno(context, params, r, "Failed to apply clock restrictions: %m");
5044 }
5045
5046 r = apply_private_devices(context, params);
5047 if (r < 0) {
5048 *exit_status = EXIT_SECCOMP;
5049 return log_exec_error_errno(context, params, r, "Failed to set up private devices: %m");
5050 }
5051
5052 r = apply_syscall_archs(context, params);
5053 if (r < 0) {
5054 *exit_status = EXIT_SECCOMP;
5055 return log_exec_error_errno(context, params, r, "Failed to apply syscall architecture restrictions: %m");
5056 }
5057
5058 r = apply_lock_personality(context, params);
5059 if (r < 0) {
5060 *exit_status = EXIT_SECCOMP;
5061 return log_exec_error_errno(context, params, r, "Failed to lock personalities: %m");
5062 }
5063
5064 r = apply_syscall_log(context, params);
5065 if (r < 0) {
5066 *exit_status = EXIT_SECCOMP;
5067 return log_exec_error_errno(context, params, r, "Failed to apply system call log filters: %m");
5068 }
24832d10 5069#endif
75689fb2 5070
24832d10
ILG
5071#if HAVE_LIBBPF
5072 r = apply_restrict_filesystems(context, params);
5073 if (r < 0) {
5074 *exit_status = EXIT_BPF;
5075 return log_exec_error_errno(context, params, r, "Failed to restrict filesystems: %m");
5076 }
5077#endif
5078
5079#if HAVE_SECCOMP
86a1ee93 5080 /* This really should remain as close to the execve() as possible, to make sure our own code is affected
75689fb2
LB
5081 * by the filter as little as possible. */
5082 r = apply_syscall_filter(context, params, needs_ambient_hack);
5083 if (r < 0) {
5084 *exit_status = EXIT_SECCOMP;
5085 return log_exec_error_errno(context, params, r, "Failed to apply system call filters: %m");
5086 }
75689fb2 5087
24832d10
ILG
5088 if (keep_seccomp_privileges) {
5089 /* Restore the capability bounding set with what's expected from the service + the
5090 * ambient capabilities hack */
5091 if (!cap_test_all(saved_bset)) {
5092 r = capability_bounding_set_drop(saved_bset, /* right_now= */ false);
5093 if (r < 0) {
5094 *exit_status = EXIT_CAPABILITIES;
5095 return log_exec_error_errno(context, params, r, "Failed to drop bset capabilities: %m");
5096 }
5097 }
5098
5099 /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break
5100 * applications that use it. */
5101 if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SYS_ADMIN))) {
5102 r = drop_capability(CAP_SYS_ADMIN);
5103 if (r < 0) {
5104 *exit_status = EXIT_USER;
5105 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SYS_ADMIN: %m");
5106 }
5107 }
5108
5109 /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break
5110 * applications that use it. */
5111 if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SETPCAP))) {
5112 r = drop_capability(CAP_SETPCAP);
5113 if (r < 0) {
5114 *exit_status = EXIT_USER;
5115 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETPCAP: %m");
5116 }
5117 }
5118
5119 if (prctl(PR_SET_KEEPCAPS, 0) < 0) {
5120 *exit_status = EXIT_USER;
5121 return log_exec_error_errno(context, params, errno, "Failed to drop keep capabilities flag: %m");
5122 }
75689fb2
LB
5123 }
5124#endif
5125
5126 }
5127
5128 if (!strv_isempty(context->unset_environment)) {
5129 char **ee = NULL;
5130
5131 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5132 if (!ee) {
5133 *exit_status = EXIT_MEMORY;
5134 return log_oom();
5135 }
5136
5137 strv_free_and_replace(accum_env, ee);
5138 }
5139
5140 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5141 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5142
5143 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5144 if (r < 0) {
5145 *exit_status = EXIT_MEMORY;
5146 return log_exec_error_errno(context,
5147 params,
5148 r,
5149 "Failed to replace environment variables: %m");
5150 }
5151 final_argv = replaced_argv;
5152
5153 if (!strv_isempty(unset_variables)) {
5154 _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5155 log_exec_warning(context,
5156 params,
5157 "Referenced but unset environment variable evaluates to an empty string: %s",
5158 strna(ju));
5159 }
5160
5161 if (!strv_isempty(bad_variables)) {
5162 _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5163 log_exec_warning(context,
5164 params,
5165 "Invalid environment variable name evaluates to an empty string: %s",
5166 strna(jb));
5167 }
5168 } else
5169 final_argv = command->argv;
5170
5171 log_command_line(context, params, "Executing", executable, final_argv);
5172
5a5fdfe3 5173 if (params->exec_fd >= 0) {
75689fb2
LB
5174 uint8_t hot = 1;
5175
5176 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5177 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5178
5a5fdfe3 5179 if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
75689fb2
LB
5180 *exit_status = EXIT_EXEC;
5181 return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m");
5182 }
5183 }
5184
5185 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5186
5a5fdfe3 5187 if (params->exec_fd >= 0) {
75689fb2
LB
5188 uint8_t hot = 0;
5189
5190 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5191 * that POLLHUP on it no longer means execve() succeeded. */
5192
5a5fdfe3 5193 if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
75689fb2
LB
5194 *exit_status = EXIT_EXEC;
5195 return log_exec_error_errno(context, params, errno, "Failed to disable exec_fd: %m");
5196 }
5197 }
5198
5199 *exit_status = EXIT_EXEC;
5200 return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable);
5201}