]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/exec-invoke.c
Merge pull request #30609 from YHNdnzj/analyze-fdstore
[thirdparty/systemd.git] / src / core / exec-invoke.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <sys/eventfd.h>
4 #include <sys/ioctl.h>
5 #include <sys/mount.h>
6 #include <sys/prctl.h>
7
8 #if HAVE_PAM
9 #include <security/pam_appl.h>
10 #include <security/pam_misc.h>
11 #endif
12
13 #if HAVE_APPARMOR
14 #include <sys/apparmor.h>
15 #endif
16
17 #include "sd-messages.h"
18
19 #if HAVE_APPARMOR
20 #include "apparmor-util.h"
21 #endif
22 #include "argv-util.h"
23 #include "barrier.h"
24 #include "bpf-dlopen.h"
25 #include "bpf-lsm.h"
26 #include "btrfs-util.h"
27 #include "capability-util.h"
28 #include "cgroup-setup.h"
29 #include "chase.h"
30 #include "chattr-util.h"
31 #include "chown-recursive.h"
32 #include "copy.h"
33 #include "data-fd-util.h"
34 #include "env-util.h"
35 #include "escape.h"
36 #include "exec-credential.h"
37 #include "exec-invoke.h"
38 #include "execute.h"
39 #include "exit-status.h"
40 #include "fd-util.h"
41 #include "hexdecoct.h"
42 #include "io-util.h"
43 #include "iovec-util.h"
44 #include "missing_ioprio.h"
45 #include "missing_prctl.h"
46 #include "missing_securebits.h"
47 #include "missing_syscall.h"
48 #include "mkdir-label.h"
49 #include "proc-cmdline.h"
50 #include "process-util.h"
51 #include "psi-util.h"
52 #include "rlimit-util.h"
53 #include "seccomp-util.h"
54 #include "selinux-util.h"
55 #include "signal-util.h"
56 #include "smack-util.h"
57 #include "socket-util.h"
58 #include "string-table.h"
59 #include "strv.h"
60 #include "terminal-util.h"
61 #include "utmp-wtmp.h"
62
/* Idle timeouts, expressed in microseconds. NOTE(review): the users of these two constants are not
 * visible in this part of the file — confirm what exactly they bound. */
#define IDLE_TIMEOUT_USEC (5 * USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1 * USEC_PER_SEC)

/* Send buffer size (8 MiB) requested for the stream socket connected to the journal */
#define SNDBUF_SIZE (8 * 1024 * 1024)
67
/* Relocates the passed file descriptors so that fds[i] ends up as fd number i+3, i.e. right after
 * stdin/stdout/stderr. The array is updated in place with the new fd numbers.
 *
 * Returns 0 on success, or a negative errno if duplicating an fd fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds <= 0)
                return 0;

        assert(fds);

        for (;;) {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, dup_fd;

                        /* Nothing to do if the fd already sits in its slot */
                        if (fds[idx] == wanted)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* The slot we wanted was still occupied, so we got a higher number. Remember
                         * the first index where this happened and make another pass from there. */
                        if (retry_at < 0 && dup_fd != wanted)
                                retry_at = idx;
                }

                if (retry_at < 0)
                        return 0;

                first = retry_at;
        }
}
107
/* Adjusts the file flags of the fds we are about to pass on: FD_CLOEXEC is dropped from all of them,
 * and O_NONBLOCK is set or cleared (according to 'nonblock') on the first n_socket_fds entries only.
 *
 * Returns 0 on success, or the negative error of the first helper that fails. */
static int flag_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        assert(fds || n_fds == 0);

        for (size_t k = 0; k < n_fds; k++) {
                int r;

                /* O_NONBLOCK is only touched for the socket activation fds */
                if (k < n_socket_fds) {
                        r = fd_nonblock(fds[k], nonblock);
                        if (r < 0)
                                return r;
                }

                /* These fds are meant for our children, hence FD_CLOEXEC always goes away */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
140
141 static bool is_terminal_input(ExecInput i) {
142 return IN_SET(i,
143 EXEC_INPUT_TTY,
144 EXEC_INPUT_TTY_FORCE,
145 EXEC_INPUT_TTY_FAIL);
146 }
147
148 static bool is_terminal_output(ExecOutput o) {
149 return IN_SET(o,
150 EXEC_OUTPUT_TTY,
151 EXEC_OUTPUT_KMSG_AND_CONSOLE,
152 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
153 }
154
155 static bool is_kmsg_output(ExecOutput o) {
156 return IN_SET(o,
157 EXEC_OUTPUT_KMSG,
158 EXEC_OUTPUT_KMSG_AND_CONSOLE);
159 }
160
161 static bool exec_context_needs_term(const ExecContext *c) {
162 assert(c);
163
164 /* Return true if the execution context suggests we should set $TERM to something useful. */
165
166 if (is_terminal_input(c->std_input))
167 return true;
168
169 if (is_terminal_output(c->std_output))
170 return true;
171
172 if (is_terminal_output(c->std_error))
173 return true;
174
175 return !!c->tty_path;
176 }
177
/* Opens /dev/null with the given access flags (plus O_NOCTTY) and moves the resulting fd to 'nfd'.
 * Returns the result of move_fd(), or a negative errno if opening fails. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
189
190 static int connect_journal_socket(
191 int fd,
192 const char *log_namespace,
193 uid_t uid,
194 gid_t gid) {
195
196 uid_t olduid = UID_INVALID;
197 gid_t oldgid = GID_INVALID;
198 const char *j;
199 int r;
200
201 j = log_namespace ?
202 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
203 "/run/systemd/journal/stdout";
204
205 if (gid_is_valid(gid)) {
206 oldgid = getgid();
207
208 if (setegid(gid) < 0)
209 return -errno;
210 }
211
212 if (uid_is_valid(uid)) {
213 olduid = getuid();
214
215 if (seteuid(uid) < 0) {
216 r = -errno;
217 goto restore_gid;
218 }
219 }
220
221 r = connect_unix_path(fd, AT_FDCWD, j);
222
223 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
224 an LSM interferes. */
225
226 if (uid_is_valid(uid))
227 (void) seteuid(olduid);
228
229 restore_gid:
230 if (gid_is_valid(gid))
231 (void) setegid(oldgid);
232
233 return r;
234 }
235
236 static int connect_logger_as(
237 const ExecContext *context,
238 const ExecParameters *params,
239 ExecOutput output,
240 const char *ident,
241 int nfd,
242 uid_t uid,
243 gid_t gid) {
244
245 _cleanup_close_ int fd = -EBADF;
246 int r;
247
248 assert(context);
249 assert(params);
250 assert(output < _EXEC_OUTPUT_MAX);
251 assert(ident);
252 assert(nfd >= 0);
253
254 fd = socket(AF_UNIX, SOCK_STREAM, 0);
255 if (fd < 0)
256 return -errno;
257
258 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
259 if (r < 0)
260 return r;
261
262 if (shutdown(fd, SHUT_RD) < 0)
263 return -errno;
264
265 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
266
267 if (dprintf(fd,
268 "%s\n"
269 "%s\n"
270 "%i\n"
271 "%i\n"
272 "%i\n"
273 "%i\n"
274 "%i\n",
275 context->syslog_identifier ?: ident,
276 params->flags & EXEC_PASS_LOG_UNIT ? params->unit_id : "",
277 context->syslog_priority,
278 !!context->syslog_level_prefix,
279 false,
280 is_kmsg_output(output),
281 is_terminal_output(output)) < 0)
282 return -errno;
283
284 return move_fd(TAKE_FD(fd), nfd, false);
285 }
286
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves the fd to 'nfd'.
 * Returns the result of move_fd(), or the negative error from open_terminal(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int term_fd;

        assert(path);
        assert(nfd >= 0);

        term_fd = open_terminal(path, flags | O_NOCTTY);

        return term_fd < 0 ? term_fd : move_fd(term_fd, nfd, false);
}
299
300 static int acquire_path(const char *path, int flags, mode_t mode) {
301 _cleanup_close_ int fd = -EBADF;
302 int r;
303
304 assert(path);
305
306 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
307 flags |= O_CREAT;
308
309 fd = open(path, flags|O_NOCTTY, mode);
310 if (fd >= 0)
311 return TAKE_FD(fd);
312
313 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
314 return -errno;
315
316 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
317
318 fd = socket(AF_UNIX, SOCK_STREAM, 0);
319 if (fd < 0)
320 return -errno;
321
322 r = connect_unix_path(fd, AT_FDCWD, path);
323 if (IN_SET(r, -ENOTSOCK, -EINVAL))
324 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
325 * wasn't an AF_UNIX socket after all */
326 return -ENXIO;
327 if (r < 0)
328 return r;
329
330 if ((flags & O_ACCMODE) == O_RDONLY)
331 r = shutdown(fd, SHUT_WR);
332 else if ((flags & O_ACCMODE) == O_WRONLY)
333 r = shutdown(fd, SHUT_RD);
334 else
335 r = 0;
336 if (r < 0)
337 return -errno;
338
339 return TAKE_FD(fd);
340 }
341
342 static int fixup_input(
343 const ExecContext *context,
344 int socket_fd,
345 bool apply_tty_stdin) {
346
347 ExecInput std_input;
348
349 assert(context);
350
351 std_input = context->std_input;
352
353 if (is_terminal_input(std_input) && !apply_tty_stdin)
354 return EXEC_INPUT_NULL;
355
356 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
357 return EXEC_INPUT_NULL;
358
359 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
360 return EXEC_INPUT_NULL;
361
362 return std_input;
363 }
364
365 static int fixup_output(ExecOutput output, int socket_fd) {
366
367 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
368 return EXEC_OUTPUT_INHERIT;
369
370 return output;
371 }
372
373 static int setup_input(
374 const ExecContext *context,
375 const ExecParameters *params,
376 int socket_fd,
377 const int named_iofds[static 3]) {
378
379 ExecInput i;
380 int r;
381
382 assert(context);
383 assert(params);
384 assert(named_iofds);
385
386 if (params->stdin_fd >= 0) {
387 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
388 return -errno;
389
390 /* Try to make this the controlling tty, if it is a tty, and reset it */
391 if (isatty(STDIN_FILENO)) {
392 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
393
394 if (context->tty_reset)
395 (void) reset_terminal_fd(STDIN_FILENO, /* switch_to_text= */ true);
396
397 (void) exec_context_apply_tty_size(context, STDIN_FILENO, /* tty_path= */ NULL);
398 }
399
400 return STDIN_FILENO;
401 }
402
403 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
404
405 switch (i) {
406
407 case EXEC_INPUT_NULL:
408 return open_null_as(O_RDONLY, STDIN_FILENO);
409
410 case EXEC_INPUT_TTY:
411 case EXEC_INPUT_TTY_FORCE:
412 case EXEC_INPUT_TTY_FAIL: {
413 _cleanup_close_ int tty_fd = -EBADF;
414 const char *tty_path;
415
416 tty_path = ASSERT_PTR(exec_context_tty_path(context));
417
418 tty_fd = acquire_terminal(tty_path,
419 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
420 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
421 ACQUIRE_TERMINAL_WAIT,
422 USEC_INFINITY);
423 if (tty_fd < 0)
424 return tty_fd;
425
426 r = exec_context_apply_tty_size(context, tty_fd, tty_path);
427 if (r < 0)
428 return r;
429
430 r = move_fd(tty_fd, STDIN_FILENO, /* cloexec= */ false);
431 if (r < 0)
432 return r;
433
434 TAKE_FD(tty_fd);
435 return r;
436 }
437
438 case EXEC_INPUT_SOCKET:
439 assert(socket_fd >= 0);
440
441 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
442
443 case EXEC_INPUT_NAMED_FD:
444 assert(named_iofds[STDIN_FILENO] >= 0);
445
446 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
447 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
448
449 case EXEC_INPUT_DATA: {
450 int fd;
451
452 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
453 if (fd < 0)
454 return fd;
455
456 return move_fd(fd, STDIN_FILENO, false);
457 }
458
459 case EXEC_INPUT_FILE: {
460 bool rw;
461 int fd;
462
463 assert(context->stdio_file[STDIN_FILENO]);
464
465 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
466 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
467
468 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
469 if (fd < 0)
470 return fd;
471
472 return move_fd(fd, STDIN_FILENO, false);
473 }
474
475 default:
476 assert_not_reached();
477 }
478 }
479
480 static bool can_inherit_stderr_from_stdout(
481 const ExecContext *context,
482 ExecOutput o,
483 ExecOutput e) {
484
485 assert(context);
486
487 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
488 * stderr fd */
489
490 if (e == EXEC_OUTPUT_INHERIT)
491 return true;
492 if (e != o)
493 return false;
494
495 if (e == EXEC_OUTPUT_NAMED_FD)
496 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
497
498 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
499 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
500
501 return true;
502 }
503
504 static int setup_output(
505 const ExecContext *context,
506 const ExecParameters *params,
507 int fileno,
508 int socket_fd,
509 const int named_iofds[static 3],
510 const char *ident,
511 uid_t uid,
512 gid_t gid,
513 dev_t *journal_stream_dev,
514 ino_t *journal_stream_ino) {
515
516 ExecOutput o;
517 ExecInput i;
518 int r;
519
520 assert(context);
521 assert(params);
522 assert(ident);
523 assert(journal_stream_dev);
524 assert(journal_stream_ino);
525
526 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
527
528 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
529 return -errno;
530
531 return STDOUT_FILENO;
532 }
533
534 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
535 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
536 return -errno;
537
538 return STDERR_FILENO;
539 }
540
541 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
542 o = fixup_output(context->std_output, socket_fd);
543
544 if (fileno == STDERR_FILENO) {
545 ExecOutput e;
546 e = fixup_output(context->std_error, socket_fd);
547
548 /* This expects the input and output are already set up */
549
550 /* Don't change the stderr file descriptor if we inherit all
551 * the way and are not on a tty */
552 if (e == EXEC_OUTPUT_INHERIT &&
553 o == EXEC_OUTPUT_INHERIT &&
554 i == EXEC_INPUT_NULL &&
555 !is_terminal_input(context->std_input) &&
556 getppid() != 1)
557 return fileno;
558
559 /* Duplicate from stdout if possible */
560 if (can_inherit_stderr_from_stdout(context, o, e))
561 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
562
563 o = e;
564
565 } else if (o == EXEC_OUTPUT_INHERIT) {
566 /* If input got downgraded, inherit the original value */
567 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
568 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
569
570 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
571 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
572 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
573
574 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
575 if (getppid() != 1)
576 return fileno;
577
578 /* We need to open /dev/null here anew, to get the right access mode. */
579 return open_null_as(O_WRONLY, fileno);
580 }
581
582 switch (o) {
583
584 case EXEC_OUTPUT_NULL:
585 return open_null_as(O_WRONLY, fileno);
586
587 case EXEC_OUTPUT_TTY:
588 if (is_terminal_input(i))
589 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
590
591 /* We don't reset the terminal if this is just about output */
592 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
593
594 case EXEC_OUTPUT_KMSG:
595 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
596 case EXEC_OUTPUT_JOURNAL:
597 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
598 r = connect_logger_as(context, params, o, ident, fileno, uid, gid);
599 if (r < 0) {
600 log_exec_warning_errno(context,
601 params,
602 r,
603 "Failed to connect %s to the journal socket, ignoring: %m",
604 fileno == STDOUT_FILENO ? "stdout" : "stderr");
605 r = open_null_as(O_WRONLY, fileno);
606 } else {
607 struct stat st;
608
609 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
610 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
611 * services to detect whether they are connected to the journal or not.
612 *
613 * If both stdout and stderr are connected to a stream then let's make sure to store the data
614 * about STDERR as that's usually the best way to do logging. */
615
616 if (fstat(fileno, &st) >= 0 &&
617 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
618 *journal_stream_dev = st.st_dev;
619 *journal_stream_ino = st.st_ino;
620 }
621 }
622 return r;
623
624 case EXEC_OUTPUT_SOCKET:
625 assert(socket_fd >= 0);
626
627 return RET_NERRNO(dup2(socket_fd, fileno));
628
629 case EXEC_OUTPUT_NAMED_FD:
630 assert(named_iofds[fileno] >= 0);
631
632 (void) fd_nonblock(named_iofds[fileno], false);
633 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
634
635 case EXEC_OUTPUT_FILE:
636 case EXEC_OUTPUT_FILE_APPEND:
637 case EXEC_OUTPUT_FILE_TRUNCATE: {
638 bool rw;
639 int fd, flags;
640
641 assert(context->stdio_file[fileno]);
642
643 rw = context->std_input == EXEC_INPUT_FILE &&
644 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
645
646 if (rw)
647 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
648
649 flags = O_WRONLY;
650 if (o == EXEC_OUTPUT_FILE_APPEND)
651 flags |= O_APPEND;
652 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
653 flags |= O_TRUNC;
654
655 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
656 if (fd < 0)
657 return fd;
658
659 return move_fd(fd, fileno, 0);
660 }
661
662 default:
663 assert_not_reached();
664 }
665 }
666
667 static int chown_terminal(int fd, uid_t uid) {
668 int r;
669
670 assert(fd >= 0);
671
672 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
673 if (!isatty_safe(fd))
674 return 0;
675
676 /* This might fail. What matters are the results. */
677 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
678 if (r < 0)
679 return r;
680
681 return 1;
682 }
683
684 static int setup_confirm_stdio(
685 const ExecContext *context,
686 const char *vc,
687 int *ret_saved_stdin,
688 int *ret_saved_stdout) {
689
690 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
691 int r;
692
693 assert(ret_saved_stdin);
694 assert(ret_saved_stdout);
695
696 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
697 if (saved_stdin < 0)
698 return -errno;
699
700 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
701 if (saved_stdout < 0)
702 return -errno;
703
704 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
705 if (fd < 0)
706 return fd;
707
708 r = chown_terminal(fd, getuid());
709 if (r < 0)
710 return r;
711
712 r = reset_terminal_fd(fd, /* switch_to_text= */ true);
713 if (r < 0)
714 return r;
715
716 r = exec_context_apply_tty_size(context, fd, vc);
717 if (r < 0)
718 return r;
719
720 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
721 TAKE_FD(fd);
722 if (r < 0)
723 return r;
724
725 *ret_saved_stdin = TAKE_FD(saved_stdin);
726 *ret_saved_stdout = TAKE_FD(saved_stdout);
727 return 0;
728 }
729
/* Writes a message to 'fd' explaining that the confirmation question for 'unit_id' could not be
 * asked (negative error 'err') and that a positive response is assumed. */
static void write_confirm_error_fd(int err, int fd, const char *unit_id) {
        assert(err < 0);
        assert(unit_id);

        if (err != -ETIMEDOUT) {
                /* %m picks up the error from errno */
                errno = -err;
                dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", unit_id);
                return;
        }

        dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", unit_id);
}
741
742 static void write_confirm_error(int err, const char *vc, const char *unit_id) {
743 _cleanup_close_ int fd = -EBADF;
744
745 assert(vc);
746
747 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
748 if (fd < 0)
749 return;
750
751 write_confirm_error_fd(err, fd, unit_id);
752 }
753
/* Releases the terminal and puts the saved stdin/stdout back into place. Both saved fds are closed
 * and invalidated in any case.
 *
 * Returns 0 on success, or the negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
775
/* Possible outcomes of asking the user for confirmation */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute, pretend the command failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute, pretend the command succeeded */
        CONFIRM_EXECUTE         =  1, /* execute the command */
};
781
/* Returns true if the flag file /run/systemd/confirm_spawn_disabled exists */
static bool confirm_spawn_disabled(void) {
        int rc = access("/run/systemd/confirm_spawn_disabled", F_OK);

        return rc >= 0;
}
785
786 static int ask_for_confirmation(const ExecContext *context, const ExecParameters *params, const char *cmdline) {
787 int saved_stdout = -1, saved_stdin = -1, r;
788 _cleanup_free_ char *e = NULL;
789 char c;
790
791 assert(context);
792 assert(params);
793
794 /* For any internal errors, assume a positive response. */
795 r = setup_confirm_stdio(context, params->confirm_spawn, &saved_stdin, &saved_stdout);
796 if (r < 0) {
797 write_confirm_error(r, params->confirm_spawn, params->unit_id);
798 return CONFIRM_EXECUTE;
799 }
800
801 /* confirm_spawn might have been disabled while we were sleeping. */
802 if (!params->confirm_spawn || confirm_spawn_disabled()) {
803 r = 1;
804 goto restore_stdio;
805 }
806
807 e = ellipsize(cmdline, 60, 100);
808 if (!e) {
809 log_oom();
810 r = CONFIRM_EXECUTE;
811 goto restore_stdio;
812 }
813
814 for (;;) {
815 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
816 if (r < 0) {
817 write_confirm_error_fd(r, STDOUT_FILENO, params->unit_id);
818 r = CONFIRM_EXECUTE;
819 goto restore_stdio;
820 }
821
822 switch (c) {
823 case 'c':
824 printf("Resuming normal execution.\n");
825 manager_disable_confirm_spawn();
826 r = 1;
827 break;
828 case 'D':
829 printf(" Unit: %s\n",
830 params->unit_id);
831 exec_context_dump(context, stdout, " ");
832 exec_params_dump(params, stdout, " ");
833 continue; /* ask again */
834 case 'f':
835 printf("Failing execution.\n");
836 r = CONFIRM_PRETEND_FAILURE;
837 break;
838 case 'h':
839 printf(" c - continue, proceed without asking anymore\n"
840 " D - dump, show the state of the unit\n"
841 " f - fail, don't execute the command and pretend it failed\n"
842 " h - help\n"
843 " i - info, show a short summary of the unit\n"
844 " j - jobs, show jobs that are in progress\n"
845 " s - skip, don't execute the command and pretend it succeeded\n"
846 " y - yes, execute the command\n");
847 continue; /* ask again */
848 case 'i':
849 printf(" Unit: %s\n"
850 " Command: %s\n",
851 params->unit_id, cmdline);
852 continue; /* ask again */
853 case 'j':
854 if (sigqueue(getppid(),
855 SIGRTMIN+18,
856 (const union sigval) { .sival_int = MANAGER_SIGNAL_COMMAND_DUMP_JOBS }) < 0)
857 return -errno;
858
859 continue; /* ask again */
860 case 'n':
861 /* 'n' was removed in favor of 'f'. */
862 printf("Didn't understand 'n', did you mean 'f'?\n");
863 continue; /* ask again */
864 case 's':
865 printf("Skipping execution.\n");
866 r = CONFIRM_PRETEND_SUCCESS;
867 break;
868 case 'y':
869 r = CONFIRM_EXECUTE;
870 break;
871 default:
872 assert_not_reached();
873 }
874 break;
875 }
876
877 restore_stdio:
878 restore_confirm_stdio(&saved_stdin, &saved_stdout);
879 return r;
880 }
881
882 static int get_fixed_user(
883 const char *user_or_uid,
884 const char **ret_username,
885 uid_t *ret_uid,
886 gid_t *ret_gid,
887 const char **ret_home,
888 const char **ret_shell) {
889
890 int r;
891
892 assert(user_or_uid);
893 assert(ret_username);
894
895 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
896 * (i.e. are "/" or "/bin/nologin"). */
897
898 r = get_user_creds(&user_or_uid, ret_uid, ret_gid, ret_home, ret_shell, USER_CREDS_CLEAN);
899 if (r < 0)
900 return r;
901
902 /* user_or_uid is normalized by get_user_creds to username */
903 *ret_username = user_or_uid;
904
905 return 0;
906 }
907
/* Resolves a group name or numeric GID via get_group_creds() and returns the normalized group name
 * along with the GID. Returns 0 on success, a negative error otherwise. */
static int get_fixed_group(
                const char *group_or_gid,
                const char **ret_groupname,
                gid_t *ret_gid) {

        const char *name;
        int r;

        assert(group_or_gid);
        assert(ret_groupname);

        name = group_or_gid;

        r = get_group_creds(&name, ret_gid, /* flags = */ 0);
        if (r < 0)
                return r;

        /* get_group_creds() has normalized the string to the group name */
        *ret_groupname = name;

        return 0;
}
927
928 static int get_supplementary_groups(const ExecContext *c, const char *user,
929 const char *group, gid_t gid,
930 gid_t **supplementary_gids, int *ngids) {
931 int r, k = 0;
932 int ngroups_max;
933 bool keep_groups = false;
934 gid_t *groups = NULL;
935 _cleanup_free_ gid_t *l_gids = NULL;
936
937 assert(c);
938
939 /*
940 * If user is given, then lookup GID and supplementary groups list.
941 * We avoid NSS lookups for gid=0. Also we have to initialize groups
942 * here and as early as possible so we keep the list of supplementary
943 * groups of the caller.
944 */
945 if (user && gid_is_valid(gid) && gid != 0) {
946 /* First step, initialize groups from /etc/groups */
947 if (initgroups(user, gid) < 0)
948 return -errno;
949
950 keep_groups = true;
951 }
952
953 if (strv_isempty(c->supplementary_groups))
954 return 0;
955
956 /*
957 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
958 * be positive, otherwise fail.
959 */
960 errno = 0;
961 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
962 if (ngroups_max <= 0)
963 return errno_or_else(EOPNOTSUPP);
964
965 l_gids = new(gid_t, ngroups_max);
966 if (!l_gids)
967 return -ENOMEM;
968
969 if (keep_groups) {
970 /*
971 * Lookup the list of groups that the user belongs to, we
972 * avoid NSS lookups here too for gid=0.
973 */
974 k = ngroups_max;
975 if (getgrouplist(user, gid, l_gids, &k) < 0)
976 return -EINVAL;
977 } else
978 k = 0;
979
980 STRV_FOREACH(i, c->supplementary_groups) {
981 const char *g;
982
983 if (k >= ngroups_max)
984 return -E2BIG;
985
986 g = *i;
987 r = get_group_creds(&g, l_gids+k, 0);
988 if (r < 0)
989 return r;
990
991 k++;
992 }
993
994 /*
995 * Sets ngids to zero to drop all supplementary groups, happens
996 * when we are under root and SupplementaryGroups= is empty.
997 */
998 if (k == 0) {
999 *ngids = 0;
1000 return 0;
1001 }
1002
1003 /* Otherwise get the final list of supplementary groups */
1004 groups = memdup(l_gids, sizeof(gid_t) * k);
1005 if (!groups)
1006 return -ENOMEM;
1007
1008 *supplementary_gids = groups;
1009 *ngids = k;
1010
1011 groups = NULL;
1012
1013 return 0;
1014 }
1015
/* Applies the supplementary groups (if any were determined) and then the real, effective and saved
 * GID (if 'gid' is valid). Returns 0 on success, a negative errno-style error otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Only touch the supplementary groups if SupplementaryGroups= resulted in a non-empty list */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then switch over the gids themselves */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1034
/* Updates the securebits of the calling thread: the bits in 'mask' are cleared, then those in 'bits'
 * are set. Returns 0 if the securebits already had the requested value, 1 if they were changed, and a
 * negative errno if querying or setting them fails. */
static int set_securebits(unsigned bits, unsigned mask) {
        unsigned before, after;
        int cur;

        cur = prctl(PR_GET_SECUREBITS);
        if (cur < 0)
                return -errno;

        before = (unsigned) cur;
        after = (before & ~mask) | bits;

        /* Avoid the second syscall if nothing would change */
        if (after == before)
                return 0;

        return prctl(PR_SET_SECUREBITS, after) < 0 ? -errno : 1;
}
1053
1054 static int enforce_user(
1055 const ExecContext *context,
1056 uid_t uid,
1057 uint64_t capability_ambient_set) {
1058 assert(context);
1059 int r;
1060
1061 if (!uid_is_valid(uid))
1062 return 0;
1063
1064 /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1065 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1066 * case. */
1067
1068 if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1069
1070 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1071 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1072 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1073 if (r < 0)
1074 return r;
1075 }
1076
1077 /* Second step: actually set the uids */
1078 if (setresuid(uid, uid, uid) < 0)
1079 return -errno;
1080
1081 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1082 * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1083 * outside of this call. */
1084 return 0;
1085 }
1086
#if HAVE_PAM

/* PAM conversation callback that refuses every conversation: conversations are not supported here,
 * hence all arguments are ignored and PAM_CONV_ERR is returned unconditionally. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        return PAM_CONV_ERR;
}

#endif
1101
1102 static int setup_pam(
1103 const char *name,
1104 const char *user,
1105 uid_t uid,
1106 gid_t gid,
1107 const char *tty,
1108 char ***env, /* updated on success */
1109 const int fds[], size_t n_fds) {
1110
1111 #if HAVE_PAM
1112
1113 static const struct pam_conv conv = {
1114 .conv = null_conv,
1115 .appdata_ptr = NULL
1116 };
1117
1118 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1119 _cleanup_strv_free_ char **e = NULL;
1120 pam_handle_t *handle = NULL;
1121 sigset_t old_ss;
1122 int pam_code = PAM_SUCCESS, r;
1123 bool close_session = false;
1124 pid_t pam_pid = 0, parent_pid;
1125 int flags = 0;
1126
1127 assert(name);
1128 assert(user);
1129 assert(env);
1130
1131 /* We set up PAM in the parent process, then fork. The child
1132 * will then stay around until killed via PR_GET_PDEATHSIG or
1133 * systemd via the cgroup logic. It will then remove the PAM
1134 * session again. The parent process will exec() the actual
1135 * daemon. We do things this way to ensure that the main PID
1136 * of the daemon is the one we initially fork()ed. */
1137
1138 r = barrier_create(&barrier);
1139 if (r < 0)
1140 goto fail;
1141
1142 if (log_get_max_level() < LOG_DEBUG)
1143 flags |= PAM_SILENT;
1144
1145 pam_code = pam_start(name, user, &conv, &handle);
1146 if (pam_code != PAM_SUCCESS) {
1147 handle = NULL;
1148 goto fail;
1149 }
1150
1151 if (!tty) {
1152 _cleanup_free_ char *q = NULL;
1153
1154 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1155 * out if that's the case, and read the TTY off it. */
1156
1157 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1158 tty = strjoina("/dev/", q);
1159 }
1160
1161 if (tty) {
1162 pam_code = pam_set_item(handle, PAM_TTY, tty);
1163 if (pam_code != PAM_SUCCESS)
1164 goto fail;
1165 }
1166
1167 STRV_FOREACH(nv, *env) {
1168 pam_code = pam_putenv(handle, *nv);
1169 if (pam_code != PAM_SUCCESS)
1170 goto fail;
1171 }
1172
1173 pam_code = pam_acct_mgmt(handle, flags);
1174 if (pam_code != PAM_SUCCESS)
1175 goto fail;
1176
1177 pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
1178 if (pam_code != PAM_SUCCESS)
1179 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));
1180
1181 pam_code = pam_open_session(handle, flags);
1182 if (pam_code != PAM_SUCCESS)
1183 goto fail;
1184
1185 close_session = true;
1186
1187 e = pam_getenvlist(handle);
1188 if (!e) {
1189 pam_code = PAM_BUF_ERR;
1190 goto fail;
1191 }
1192
1193 /* Block SIGTERM, so that we know that it won't get lost in the child */
1194
1195 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1196
1197 parent_pid = getpid_cached();
1198
1199 r = safe_fork("(sd-pam)", 0, &pam_pid);
1200 if (r < 0)
1201 goto fail;
1202 if (r == 0) {
1203 int ret = EXIT_PAM;
1204
1205 /* The child's job is to reset the PAM session on termination */
1206 barrier_set_role(&barrier, BARRIER_CHILD);
1207
1208 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1209 * those fds are open here that have been opened by PAM. */
1210 (void) close_many(fds, n_fds);
1211
1212 /* Drop privileges - we don't need any to pam_close_session and this will make
1213 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
1214 * threads to fail to exit normally */
1215
1216 r = fully_set_uid_gid(uid, gid, /* supplementary_gids= */ NULL, /* n_supplementary_gids= */ 0);
1217 if (r < 0)
1218 log_warning_errno(r, "Failed to drop privileges in sd-pam: %m");
1219
1220 (void) ignore_signals(SIGPIPE);
1221
1222 /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1223 * otherwise the kernel will not allow unprivileged parents kill their privileged children
1224 * this way. We rely on the control groups kill logic to do the rest for us. */
1225 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1226 goto child_finish;
1227
1228 /* Tell the parent that our setup is done. This is especially important regarding dropping
1229 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1230 *
1231 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1232 (void) barrier_place(&barrier);
1233
1234 /* Check if our parent process might already have died? */
1235 if (getppid() == parent_pid) {
1236 sigset_t ss;
1237 int sig;
1238
1239 assert_se(sigemptyset(&ss) >= 0);
1240 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1241
1242 assert_se(sigwait(&ss, &sig) == 0);
1243 assert(sig == SIGTERM);
1244 }
1245
1246 pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
1247 if (pam_code != PAM_SUCCESS)
1248 goto child_finish;
1249
1250 /* If our parent died we'll end the session */
1251 if (getppid() != parent_pid) {
1252 pam_code = pam_close_session(handle, flags);
1253 if (pam_code != PAM_SUCCESS)
1254 goto child_finish;
1255 }
1256
1257 ret = 0;
1258
1259 child_finish:
1260 /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1261 * know about this. See pam_end(3) */
1262 (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
1263 _exit(ret);
1264 }
1265
1266 barrier_set_role(&barrier, BARRIER_PARENT);
1267
1268 /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1269 * here. */
1270 handle = NULL;
1271
1272 /* Unblock SIGTERM again in the parent */
1273 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1274
1275 /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1276 * this fd around. */
1277 closelog();
1278
1279 /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1280 * recover. However, warn loudly if it happens. */
1281 if (!barrier_place_and_sync(&barrier))
1282 log_error("PAM initialization failed");
1283
1284 return strv_free_and_replace(*env, e);
1285
1286 fail:
1287 if (pam_code != PAM_SUCCESS) {
1288 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1289 r = -EPERM; /* PAM errors do not map to errno */
1290 } else
1291 log_error_errno(r, "PAM failed: %m");
1292
1293 if (handle) {
1294 if (close_session)
1295 pam_code = pam_close_session(handle, flags);
1296
1297 (void) pam_end(handle, pam_code | flags);
1298 }
1299
1300 closelog();
1301 return r;
1302 #else
1303 return 0;
1304 #endif
1305 }
1306
1307 static void rename_process_from_path(const char *path) {
1308 _cleanup_free_ char *buf = NULL;
1309 const char *p;
1310
1311 assert(path);
1312
1313 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1314 * /bin/ps */
1315
1316 if (path_extract_filename(path, &buf) < 0) {
1317 rename_process("(...)");
1318 return;
1319 }
1320
1321 size_t l = strlen(buf);
1322 if (l > 8) {
1323 /* The end of the process name is usually more interesting, since the first bit might just be
1324 * "systemd-" */
1325 p = buf + l - 8;
1326 l = 8;
1327 } else
1328 p = buf;
1329
1330 char process_name[11];
1331 process_name[0] = '(';
1332 memcpy(process_name+1, p, l);
1333 process_name[1+l] = ')';
1334 process_name[1+l+1] = 0;
1335
1336 (void) rename_process(process_name);
1337 }
1338
1339 static bool context_has_address_families(const ExecContext *c) {
1340 assert(c);
1341
1342 return c->address_families_allow_list ||
1343 !set_isempty(c->address_families);
1344 }
1345
1346 static bool context_has_syscall_filters(const ExecContext *c) {
1347 assert(c);
1348
1349 return c->syscall_allow_list ||
1350 !hashmap_isempty(c->syscall_filter);
1351 }
1352
1353 static bool context_has_syscall_logs(const ExecContext *c) {
1354 assert(c);
1355
1356 return c->syscall_log_allow_list ||
1357 !hashmap_isempty(c->syscall_log);
1358 }
1359
1360 static bool context_has_seccomp(const ExecContext *c) {
1361 /* We need NNP if we have any form of seccomp and are unprivileged */
1362 return c->lock_personality ||
1363 c->memory_deny_write_execute ||
1364 c->private_devices ||
1365 c->protect_clock ||
1366 c->protect_hostname ||
1367 c->protect_kernel_tunables ||
1368 c->protect_kernel_modules ||
1369 c->protect_kernel_logs ||
1370 context_has_address_families(c) ||
1371 exec_context_restrict_namespaces_set(c) ||
1372 c->restrict_realtime ||
1373 c->restrict_suid_sgid ||
1374 !set_isempty(c->syscall_archs) ||
1375 context_has_syscall_filters(c) ||
1376 context_has_syscall_logs(c);
1377 }
1378
1379 static bool context_has_no_new_privileges(const ExecContext *c) {
1380 assert(c);
1381
1382 if (c->no_new_privileges)
1383 return true;
1384
1385 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1386 return false;
1387
1388 return context_has_seccomp(c);
1389 }
1390
1391 #if HAVE_SECCOMP
1392
1393 static bool seccomp_allows_drop_privileges(const ExecContext *c) {
1394 void *id, *val;
1395 bool has_capget = false, has_capset = false, has_prctl = false;
1396
1397 assert(c);
1398
1399 /* No syscall filter, we are allowed to drop privileges */
1400 if (hashmap_isempty(c->syscall_filter))
1401 return true;
1402
1403 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
1404 _cleanup_free_ char *name = NULL;
1405
1406 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
1407
1408 if (streq(name, "capget"))
1409 has_capget = true;
1410 else if (streq(name, "capset"))
1411 has_capset = true;
1412 else if (streq(name, "prctl"))
1413 has_prctl = true;
1414 }
1415
1416 if (c->syscall_allow_list)
1417 return has_capget && has_capset && has_prctl;
1418 else
1419 return !(has_capget || has_capset || has_prctl);
1420 }
1421
1422 static bool skip_seccomp_unavailable(const ExecContext *c, const ExecParameters *p, const char* msg) {
1423
1424 if (is_seccomp_available())
1425 return false;
1426
1427 log_exec_debug(c, p, "SECCOMP features not detected in the kernel, skipping %s", msg);
1428 return true;
1429 }
1430
1431 static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p, bool needs_ambient_hack) {
1432 uint32_t negative_action, default_action, action;
1433 int r;
1434
1435 assert(c);
1436 assert(p);
1437
1438 if (!context_has_syscall_filters(c))
1439 return 0;
1440
1441 if (skip_seccomp_unavailable(c, p, "SystemCallFilter="))
1442 return 0;
1443
1444 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1445
1446 if (c->syscall_allow_list) {
1447 default_action = negative_action;
1448 action = SCMP_ACT_ALLOW;
1449 } else {
1450 default_action = SCMP_ACT_ALLOW;
1451 action = negative_action;
1452 }
1453
1454 if (needs_ambient_hack) {
1455 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1456 if (r < 0)
1457 return r;
1458 }
1459
1460 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1461 }
1462
1463 static int apply_syscall_log(const ExecContext *c, const ExecParameters *p) {
1464 #ifdef SCMP_ACT_LOG
1465 uint32_t default_action, action;
1466 #endif
1467
1468 assert(c);
1469 assert(p);
1470
1471 if (!context_has_syscall_logs(c))
1472 return 0;
1473
1474 #ifdef SCMP_ACT_LOG
1475 if (skip_seccomp_unavailable(c, p, "SystemCallLog="))
1476 return 0;
1477
1478 if (c->syscall_log_allow_list) {
1479 /* Log nothing but the ones listed */
1480 default_action = SCMP_ACT_ALLOW;
1481 action = SCMP_ACT_LOG;
1482 } else {
1483 /* Log everything but the ones listed */
1484 default_action = SCMP_ACT_LOG;
1485 action = SCMP_ACT_ALLOW;
1486 }
1487
1488 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1489 #else
1490 /* old libseccomp */
1491 log_exec_debug(c, p, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1492 return 0;
1493 #endif
1494 }
1495
1496 static int apply_syscall_archs(const ExecContext *c, const ExecParameters *p) {
1497 assert(c);
1498 assert(p);
1499
1500 if (set_isempty(c->syscall_archs))
1501 return 0;
1502
1503 if (skip_seccomp_unavailable(c, p, "SystemCallArchitectures="))
1504 return 0;
1505
1506 return seccomp_restrict_archs(c->syscall_archs);
1507 }
1508
1509 static int apply_address_families(const ExecContext *c, const ExecParameters *p) {
1510 assert(c);
1511 assert(p);
1512
1513 if (!context_has_address_families(c))
1514 return 0;
1515
1516 if (skip_seccomp_unavailable(c, p, "RestrictAddressFamilies="))
1517 return 0;
1518
1519 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1520 }
1521
1522 static int apply_memory_deny_write_execute(const ExecContext *c, const ExecParameters *p) {
1523 int r;
1524
1525 assert(c);
1526 assert(p);
1527
1528 if (!c->memory_deny_write_execute)
1529 return 0;
1530
1531 /* use prctl() if kernel supports it (6.3) */
1532 r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1533 if (r == 0) {
1534 log_exec_debug(c, p, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1535 return 0;
1536 }
1537 if (r < 0 && errno != EINVAL)
1538 return log_exec_debug_errno(c,
1539 p,
1540 errno,
1541 "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1542 /* else use seccomp */
1543 log_exec_debug(c, p, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1544
1545 if (skip_seccomp_unavailable(c, p, "MemoryDenyWriteExecute="))
1546 return 0;
1547
1548 return seccomp_memory_deny_write_execute();
1549 }
1550
1551 static int apply_restrict_realtime(const ExecContext *c, const ExecParameters *p) {
1552 assert(c);
1553 assert(p);
1554
1555 if (!c->restrict_realtime)
1556 return 0;
1557
1558 if (skip_seccomp_unavailable(c, p, "RestrictRealtime="))
1559 return 0;
1560
1561 return seccomp_restrict_realtime();
1562 }
1563
1564 static int apply_restrict_suid_sgid(const ExecContext *c, const ExecParameters *p) {
1565 assert(c);
1566 assert(p);
1567
1568 if (!c->restrict_suid_sgid)
1569 return 0;
1570
1571 if (skip_seccomp_unavailable(c, p, "RestrictSUIDSGID="))
1572 return 0;
1573
1574 return seccomp_restrict_suid_sgid();
1575 }
1576
1577 static int apply_protect_sysctl(const ExecContext *c, const ExecParameters *p) {
1578 assert(c);
1579 assert(p);
1580
1581 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1582 * let's protect even those systems where this is left on in the kernel. */
1583
1584 if (!c->protect_kernel_tunables)
1585 return 0;
1586
1587 if (skip_seccomp_unavailable(c, p, "ProtectKernelTunables="))
1588 return 0;
1589
1590 return seccomp_protect_sysctl();
1591 }
1592
1593 static int apply_protect_kernel_modules(const ExecContext *c, const ExecParameters *p) {
1594 assert(c);
1595 assert(p);
1596
1597 /* Turn off module syscalls on ProtectKernelModules=yes */
1598
1599 if (!c->protect_kernel_modules)
1600 return 0;
1601
1602 if (skip_seccomp_unavailable(c, p, "ProtectKernelModules="))
1603 return 0;
1604
1605 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1606 }
1607
1608 static int apply_protect_kernel_logs(const ExecContext *c, const ExecParameters *p) {
1609 assert(c);
1610 assert(p);
1611
1612 if (!c->protect_kernel_logs)
1613 return 0;
1614
1615 if (skip_seccomp_unavailable(c, p, "ProtectKernelLogs="))
1616 return 0;
1617
1618 return seccomp_protect_syslog();
1619 }
1620
1621 static int apply_protect_clock(const ExecContext *c, const ExecParameters *p) {
1622 assert(c);
1623 assert(p);
1624
1625 if (!c->protect_clock)
1626 return 0;
1627
1628 if (skip_seccomp_unavailable(c, p, "ProtectClock="))
1629 return 0;
1630
1631 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1632 }
1633
1634 static int apply_private_devices(const ExecContext *c, const ExecParameters *p) {
1635 assert(c);
1636 assert(p);
1637
1638 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1639
1640 if (!c->private_devices)
1641 return 0;
1642
1643 if (skip_seccomp_unavailable(c, p, "PrivateDevices="))
1644 return 0;
1645
1646 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1647 }
1648
1649 static int apply_restrict_namespaces(const ExecContext *c, const ExecParameters *p) {
1650 assert(c);
1651 assert(p);
1652
1653 if (!exec_context_restrict_namespaces_set(c))
1654 return 0;
1655
1656 if (skip_seccomp_unavailable(c, p, "RestrictNamespaces="))
1657 return 0;
1658
1659 return seccomp_restrict_namespaces(c->restrict_namespaces);
1660 }
1661
1662 static int apply_lock_personality(const ExecContext *c, const ExecParameters *p) {
1663 unsigned long personality;
1664 int r;
1665
1666 assert(c);
1667 assert(p);
1668
1669 if (!c->lock_personality)
1670 return 0;
1671
1672 if (skip_seccomp_unavailable(c, p, "LockPersonality="))
1673 return 0;
1674
1675 personality = c->personality;
1676
1677 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1678 if (personality == PERSONALITY_INVALID) {
1679
1680 r = opinionated_personality(&personality);
1681 if (r < 0)
1682 return r;
1683 }
1684
1685 return seccomp_lock_personality(personality);
1686 }
1687
1688 #endif
1689
1690 #if HAVE_LIBBPF
1691 static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) {
1692 int r;
1693
1694 assert(c);
1695 assert(p);
1696
1697 if (!exec_context_restrict_filesystems_set(c))
1698 return 0;
1699
1700 if (p->bpf_outer_map_fd < 0) {
1701 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1702 log_exec_debug(c, p, "LSM BPF not supported, skipping RestrictFileSystems=");
1703 return 0;
1704 }
1705
1706 /* We are in a new binary, so dl-open again */
1707 r = dlopen_bpf();
1708 if (r < 0)
1709 return r;
1710
1711 return lsm_bpf_restrict_filesystems(c->restrict_filesystems, p->cgroup_id, p->bpf_outer_map_fd, c->restrict_filesystems_allow_list);
1712 }
1713 #endif
1714
1715 static int apply_protect_hostname(const ExecContext *c, const ExecParameters *p, int *ret_exit_status) {
1716 assert(c);
1717 assert(p);
1718
1719 if (!c->protect_hostname)
1720 return 0;
1721
1722 if (ns_type_supported(NAMESPACE_UTS)) {
1723 if (unshare(CLONE_NEWUTS) < 0) {
1724 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1725 *ret_exit_status = EXIT_NAMESPACE;
1726 return log_exec_error_errno(c,
1727 p,
1728 errno,
1729 "Failed to set up UTS namespacing: %m");
1730 }
1731
1732 log_exec_warning(c,
1733 p,
1734 "ProtectHostname=yes is configured, but UTS namespace setup is "
1735 "prohibited (container manager?), ignoring namespace setup.");
1736 }
1737 } else
1738 log_exec_warning(c,
1739 p,
1740 "ProtectHostname=yes is configured, but the kernel does not "
1741 "support UTS namespaces, ignoring namespace setup.");
1742
1743 #if HAVE_SECCOMP
1744 int r;
1745
1746 if (skip_seccomp_unavailable(c, p, "ProtectHostname="))
1747 return 0;
1748
1749 r = seccomp_protect_hostname();
1750 if (r < 0) {
1751 *ret_exit_status = EXIT_SECCOMP;
1752 return log_exec_error_errno(c, p, r, "Failed to apply hostname restrictions: %m");
1753 }
1754 #endif
1755
1756 return 0;
1757 }
1758
1759 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1760 assert(idle_pipe);
1761
1762 idle_pipe[1] = safe_close(idle_pipe[1]);
1763 idle_pipe[2] = safe_close(idle_pipe[2]);
1764
1765 if (idle_pipe[0] >= 0) {
1766 int r;
1767
1768 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1769
1770 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1771 ssize_t n;
1772
1773 /* Signal systemd that we are bored and want to continue. */
1774 n = write(idle_pipe[3], "x", 1);
1775 if (n > 0)
1776 /* Wait for systemd to react to the signal above. */
1777 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1778 }
1779
1780 idle_pipe[0] = safe_close(idle_pipe[0]);
1781
1782 }
1783
1784 idle_pipe[3] = safe_close(idle_pipe[3]);
1785 }
1786
1787 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1788
1789 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
1790 * the service payload in. */
1791 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1792 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
1793 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
1794 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
1795 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
1796 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
1797 };
1798
1799 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
1800
1801 static int build_environment(
1802 const ExecContext *c,
1803 const ExecParameters *p,
1804 const CGroupContext *cgroup_context,
1805 size_t n_fds,
1806 const char *home,
1807 const char *username,
1808 const char *shell,
1809 dev_t journal_stream_dev,
1810 ino_t journal_stream_ino,
1811 const char *memory_pressure_path,
1812 char ***ret) {
1813
1814 _cleanup_strv_free_ char **our_env = NULL;
1815 size_t n_env = 0;
1816 char *x;
1817 int r;
1818
1819 assert(c);
1820 assert(p);
1821 assert(ret);
1822
1823 #define N_ENV_VARS 19
1824 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1825 if (!our_env)
1826 return -ENOMEM;
1827
1828 if (n_fds > 0) {
1829 _cleanup_free_ char *joined = NULL;
1830
1831 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1832 return -ENOMEM;
1833 our_env[n_env++] = x;
1834
1835 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1836 return -ENOMEM;
1837 our_env[n_env++] = x;
1838
1839 joined = strv_join(p->fd_names, ":");
1840 if (!joined)
1841 return -ENOMEM;
1842
1843 x = strjoin("LISTEN_FDNAMES=", joined);
1844 if (!x)
1845 return -ENOMEM;
1846 our_env[n_env++] = x;
1847 }
1848
1849 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1850 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1851 return -ENOMEM;
1852 our_env[n_env++] = x;
1853
1854 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1855 return -ENOMEM;
1856 our_env[n_env++] = x;
1857 }
1858
1859 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1860 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1861 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1862 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1863 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1864 if (!x)
1865 return -ENOMEM;
1866 our_env[n_env++] = x;
1867 }
1868
1869 /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
1870 * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
1871 * really make much sense since we're not logged in. Hence we conditionalize the three based on
1872 * SetLoginEnvironment= switch. */
1873 if (!c->user && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
1874 r = get_fixed_user("root", &username, NULL, NULL, &home, &shell);
1875 if (r < 0)
1876 return log_exec_debug_errno(c,
1877 p,
1878 r,
1879 "Failed to determine user credentials for root: %m");
1880 }
1881
1882 bool set_user_login_env = exec_context_get_set_login_environment(c);
1883
1884 if (username) {
1885 x = strjoin("USER=", username);
1886 if (!x)
1887 return -ENOMEM;
1888 our_env[n_env++] = x;
1889
1890 if (set_user_login_env) {
1891 x = strjoin("LOGNAME=", username);
1892 if (!x)
1893 return -ENOMEM;
1894 our_env[n_env++] = x;
1895 }
1896 }
1897
1898 if (home && set_user_login_env) {
1899 x = strjoin("HOME=", home);
1900 if (!x)
1901 return -ENOMEM;
1902
1903 path_simplify(x + 5);
1904 our_env[n_env++] = x;
1905 }
1906
1907 if (shell && set_user_login_env) {
1908 x = strjoin("SHELL=", shell);
1909 if (!x)
1910 return -ENOMEM;
1911
1912 path_simplify(x + 6);
1913 our_env[n_env++] = x;
1914 }
1915
1916 if (!sd_id128_is_null(p->invocation_id)) {
1917 assert(p->invocation_id_string);
1918
1919 x = strjoin("INVOCATION_ID=", p->invocation_id_string);
1920 if (!x)
1921 return -ENOMEM;
1922
1923 our_env[n_env++] = x;
1924 }
1925
1926 if (exec_context_needs_term(c)) {
1927 _cleanup_free_ char *cmdline = NULL;
1928 const char *tty_path, *term = NULL;
1929
1930 tty_path = exec_context_tty_path(c);
1931
1932 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1933 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1934 * container manager passes to PID 1 ends up all the way in the console login shown. */
1935
1936 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1937 term = getenv("TERM");
1938 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1939 _cleanup_free_ char *key = NULL;
1940
1941 key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
1942 if (!key)
1943 return -ENOMEM;
1944
1945 r = proc_cmdline_get_key(key, 0, &cmdline);
1946 if (r < 0)
1947 log_exec_debug_errno(c,
1948 p,
1949 r,
1950 "Failed to read %s from kernel cmdline, ignoring: %m",
1951 key);
1952 else if (r > 0)
1953 term = cmdline;
1954 }
1955
1956 if (!term)
1957 term = default_term_for_tty(tty_path);
1958
1959 x = strjoin("TERM=", term);
1960 if (!x)
1961 return -ENOMEM;
1962 our_env[n_env++] = x;
1963 }
1964
1965 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1966 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1967 return -ENOMEM;
1968
1969 our_env[n_env++] = x;
1970 }
1971
1972 if (c->log_namespace) {
1973 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1974 if (!x)
1975 return -ENOMEM;
1976
1977 our_env[n_env++] = x;
1978 }
1979
1980 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1981 _cleanup_free_ char *joined = NULL;
1982 const char *n;
1983
1984 if (!p->prefix[t])
1985 continue;
1986
1987 if (c->directories[t].n_items == 0)
1988 continue;
1989
1990 n = exec_directory_env_name_to_string(t);
1991 if (!n)
1992 continue;
1993
1994 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1995 _cleanup_free_ char *prefixed = NULL;
1996
1997 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1998 if (!prefixed)
1999 return -ENOMEM;
2000
2001 if (!strextend_with_separator(&joined, ":", prefixed))
2002 return -ENOMEM;
2003 }
2004
2005 x = strjoin(n, "=", joined);
2006 if (!x)
2007 return -ENOMEM;
2008
2009 our_env[n_env++] = x;
2010 }
2011
2012 _cleanup_free_ char *creds_dir = NULL;
2013 r = exec_context_get_credential_directory(c, p, p->unit_id, &creds_dir);
2014 if (r < 0)
2015 return r;
2016 if (r > 0) {
2017 x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
2018 if (!x)
2019 return -ENOMEM;
2020
2021 our_env[n_env++] = x;
2022 }
2023
2024 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2025 return -ENOMEM;
2026
2027 our_env[n_env++] = x;
2028
2029 if (memory_pressure_path) {
2030 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2031 if (!x)
2032 return -ENOMEM;
2033
2034 our_env[n_env++] = x;
2035
2036 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2037 _cleanup_free_ char *b = NULL, *e = NULL;
2038
2039 if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2040 MEMORY_PRESSURE_DEFAULT_TYPE,
2041 cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2042 CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2043 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2044 return -ENOMEM;
2045
2046 if (base64mem(b, strlen(b) + 1, &e) < 0)
2047 return -ENOMEM;
2048
2049 x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2050 if (!x)
2051 return -ENOMEM;
2052
2053 our_env[n_env++] = x;
2054 }
2055 }
2056
2057 assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2058 #undef N_ENV_VARS
2059
2060 *ret = TAKE_PTR(our_env);
2061
2062 return 0;
2063 }
2064
2065 static int build_pass_environment(const ExecContext *c, char ***ret) {
2066 _cleanup_strv_free_ char **pass_env = NULL;
2067 size_t n_env = 0;
2068
2069 STRV_FOREACH(i, c->pass_environment) {
2070 _cleanup_free_ char *x = NULL;
2071 char *v;
2072
2073 v = getenv(*i);
2074 if (!v)
2075 continue;
2076 x = strjoin(*i, "=", v);
2077 if (!x)
2078 return -ENOMEM;
2079
2080 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2081 return -ENOMEM;
2082
2083 pass_env[n_env++] = TAKE_PTR(x);
2084 pass_env[n_env] = NULL;
2085 }
2086
2087 *ret = TAKE_PTR(pass_env);
2088
2089 return 0;
2090 }
2091
2092 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2093 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2094 _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
2095 _cleanup_close_ int unshare_ready_fd = -EBADF;
2096 _cleanup_(sigkill_waitp) pid_t pid = 0;
2097 uint64_t c = 1;
2098 ssize_t n;
2099 int r;
2100
2101 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2102 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2103 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2104 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2105 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2106 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2107 * continues execution normally.
2108 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2109 * does not need CAP_SETUID to write the single line mapping to itself. */
2110
2111 /* Can only set up multiple mappings with CAP_SETUID. */
2112 if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2113 r = asprintf(&uid_map,
2114 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2115 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2116 ouid, ouid, uid, uid);
2117 else
2118 r = asprintf(&uid_map,
2119 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2120 ouid, ouid);
2121
2122 if (r < 0)
2123 return -ENOMEM;
2124
2125 /* Can only set up multiple mappings with CAP_SETGID. */
2126 if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2127 r = asprintf(&gid_map,
2128 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2129 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2130 ogid, ogid, gid, gid);
2131 else
2132 r = asprintf(&gid_map,
2133 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2134 ogid, ogid);
2135
2136 if (r < 0)
2137 return -ENOMEM;
2138
2139 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2140 * namespace. */
2141 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2142 if (unshare_ready_fd < 0)
2143 return -errno;
2144
2145 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2146 * failed. */
2147 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2148 return -errno;
2149
2150 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, &pid);
2151 if (r < 0)
2152 return r;
2153 if (r == 0) {
2154 _cleanup_close_ int fd = -EBADF;
2155 const char *a;
2156 pid_t ppid;
2157
2158 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2159 * here, after the parent opened its own user namespace. */
2160
2161 ppid = getppid();
2162 errno_pipe[0] = safe_close(errno_pipe[0]);
2163
2164 /* Wait until the parent unshared the user namespace */
2165 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2166 r = -errno;
2167 goto child_fail;
2168 }
2169
2170 /* Disable the setgroups() system call in the child user namespace, for good. */
2171 a = procfs_file_alloca(ppid, "setgroups");
2172 fd = open(a, O_WRONLY|O_CLOEXEC);
2173 if (fd < 0) {
2174 if (errno != ENOENT) {
2175 r = -errno;
2176 goto child_fail;
2177 }
2178
2179 /* If the file is missing the kernel is too old, let's continue anyway. */
2180 } else {
2181 if (write(fd, "deny\n", 5) < 0) {
2182 r = -errno;
2183 goto child_fail;
2184 }
2185
2186 fd = safe_close(fd);
2187 }
2188
2189 /* First write the GID map */
2190 a = procfs_file_alloca(ppid, "gid_map");
2191 fd = open(a, O_WRONLY|O_CLOEXEC);
2192 if (fd < 0) {
2193 r = -errno;
2194 goto child_fail;
2195 }
2196 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2197 r = -errno;
2198 goto child_fail;
2199 }
2200 fd = safe_close(fd);
2201
2202 /* The write the UID map */
2203 a = procfs_file_alloca(ppid, "uid_map");
2204 fd = open(a, O_WRONLY|O_CLOEXEC);
2205 if (fd < 0) {
2206 r = -errno;
2207 goto child_fail;
2208 }
2209 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2210 r = -errno;
2211 goto child_fail;
2212 }
2213
2214 _exit(EXIT_SUCCESS);
2215
2216 child_fail:
2217 (void) write(errno_pipe[1], &r, sizeof(r));
2218 _exit(EXIT_FAILURE);
2219 }
2220
2221 errno_pipe[1] = safe_close(errno_pipe[1]);
2222
2223 if (unshare(CLONE_NEWUSER) < 0)
2224 return -errno;
2225
2226 /* Let the child know that the namespace is ready now */
2227 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2228 return -errno;
2229
2230 /* Try to read an error code from the child */
2231 n = read(errno_pipe[0], &r, sizeof(r));
2232 if (n < 0)
2233 return -errno;
2234 if (n == sizeof(r)) { /* an error code was sent to us */
2235 if (r < 0)
2236 return r;
2237 return -EIO;
2238 }
2239 if (n != 0) /* on success we should have read 0 bytes */
2240 return -EIO;
2241
2242 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2243 if (r < 0)
2244 return r;
2245 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2246 return -EIO;
2247
2248 return 0;
2249 }
2250
2251 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2252 _cleanup_free_ char *src_abs = NULL;
2253 int r;
2254
2255 assert(source);
2256
2257 src_abs = path_join(root, source);
2258 if (!src_abs)
2259 return -ENOMEM;
2260
2261 STRV_FOREACH(dst, symlinks) {
2262 _cleanup_free_ char *dst_abs = NULL;
2263
2264 dst_abs = path_join(root, *dst);
2265 if (!dst_abs)
2266 return -ENOMEM;
2267
2268 r = mkdir_parents_label(dst_abs, 0755);
2269 if (r < 0)
2270 return r;
2271
2272 r = symlink_idempotent(src_abs, dst_abs, true);
2273 if (r < 0)
2274 return r;
2275 }
2276
2277 return 0;
2278 }
2279
2280 static int setup_exec_directory(
2281 const ExecContext *context,
2282 const ExecParameters *params,
2283 uid_t uid,
2284 gid_t gid,
2285 ExecDirectoryType type,
2286 bool needs_mount_namespace,
2287 int *exit_status) {
2288
2289 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2290 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2291 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2292 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2293 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2294 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2295 };
2296 int r;
2297
2298 assert(context);
2299 assert(params);
2300 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2301 assert(exit_status);
2302
2303 if (!params->prefix[type])
2304 return 0;
2305
2306 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2307 if (!uid_is_valid(uid))
2308 uid = 0;
2309 if (!gid_is_valid(gid))
2310 gid = 0;
2311 }
2312
2313 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2314 _cleanup_free_ char *p = NULL, *pp = NULL;
2315
2316 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2317 if (!p) {
2318 r = -ENOMEM;
2319 goto fail;
2320 }
2321
2322 r = mkdir_parents_label(p, 0755);
2323 if (r < 0)
2324 goto fail;
2325
2326 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2327
2328 /* If we are in user mode, and a configuration directory exists but a state directory
2329 * doesn't exist, then we likely are upgrading from an older systemd version that
2330 * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2331 * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2332 * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now
2333 * separated. If a service has both dirs configured but only the configuration dir
2334 * exists and the state dir does not, we assume we are looking at an update
2335 * situation. Hence, create a compatibility symlink, so that all expectations are
2336 * met.
2337 *
2338 * (We also do something similar with the log directory, which still doesn't exist in
2339 * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2340
2341 /* this assumes the state dir is always created before the configuration dir */
2342 assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2343 assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2344
2345 r = laccess(p, F_OK);
2346 if (r == -ENOENT) {
2347 _cleanup_free_ char *q = NULL;
2348
2349 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2350 * under the configuration hierarchy. */
2351
2352 if (type == EXEC_DIRECTORY_STATE)
2353 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2354 else if (type == EXEC_DIRECTORY_LOGS)
2355 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2356 else
2357 assert_not_reached();
2358 if (!q) {
2359 r = -ENOMEM;
2360 goto fail;
2361 }
2362
2363 r = laccess(q, F_OK);
2364 if (r >= 0) {
2365 /* It does exist! This hence looks like an update. Symlink the
2366 * configuration directory into the state directory. */
2367
2368 r = symlink_idempotent(q, p, /* make_relative= */ true);
2369 if (r < 0)
2370 goto fail;
2371
2372 log_exec_notice(context, params, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2373 continue;
2374 } else if (r != -ENOENT)
2375 log_exec_warning_errno(context, params, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2376
2377 } else if (r < 0)
2378 log_exec_warning_errno(context, params, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2379 }
2380
2381 if (exec_directory_is_private(context, type)) {
2382 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2383 * case we want to avoid leaving a directory around fully accessible that is owned by
2384 * a dynamic user whose UID is later on reused. To lock this down we use the same
2385 * trick used by container managers to prohibit host users to get access to files of
2386 * the same UID in containers: we place everything inside a directory that has an
2387 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2388 * for unprivileged host code. We then use fs namespacing to make this directory
2389 * permeable for the service itself.
2390 *
2391 * Specifically: for a service which wants a special directory "foo/" we first create
2392 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2393 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2394 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2395 * unprivileged host users can't look into it. Inside of the namespace of the unit
2396 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2397 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2398 * for the service and making sure it only gets access to the dirs it needs but no
2399 * others. Tricky? Yes, absolutely, but it works!
2400 *
2401 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2402 * to be owned by the service itself.
2403 *
2404 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2405 * for sharing files or sockets with other services. */
2406
2407 pp = path_join(params->prefix[type], "private");
2408 if (!pp) {
2409 r = -ENOMEM;
2410 goto fail;
2411 }
2412
2413 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2414 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2415 if (r < 0)
2416 goto fail;
2417
2418 if (!path_extend(&pp, context->directories[type].items[i].path)) {
2419 r = -ENOMEM;
2420 goto fail;
2421 }
2422
2423 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2424 r = mkdir_parents_label(pp, 0755);
2425 if (r < 0)
2426 goto fail;
2427
2428 if (is_dir(p, false) > 0 &&
2429 (laccess(pp, F_OK) == -ENOENT)) {
2430
2431 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2432 * it over. Most likely the service has been upgraded from one that didn't use
2433 * DynamicUser=1, to one that does. */
2434
2435 log_exec_info(context,
2436 params,
2437 "Found pre-existing public %s= directory %s, migrating to %s.\n"
2438 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2439 exec_directory_type_to_string(type), p, pp);
2440
2441 r = RET_NERRNO(rename(p, pp));
2442 if (r < 0)
2443 goto fail;
2444 } else {
2445 /* Otherwise, create the actual directory for the service */
2446
2447 r = mkdir_label(pp, context->directories[type].mode);
2448 if (r < 0 && r != -EEXIST)
2449 goto fail;
2450 }
2451
2452 if (!context->directories[type].items[i].only_create) {
2453 /* And link it up from the original place.
2454 * Notes
2455 * 1) If a mount namespace is going to be used, then this symlink remains on
2456 * the host, and a new one for the child namespace will be created later.
2457 * 2) It is not necessary to create this symlink when one of its parent
2458 * directories is specified and already created. E.g.
2459 * StateDirectory=foo foo/bar
2460 * In that case, the inode points to pp and p for "foo/bar" are the same:
2461 * pp = "/var/lib/private/foo/bar"
2462 * p = "/var/lib/foo/bar"
2463 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2464 * we do not need to create the symlink, but we cannot create the symlink.
2465 * See issue #24783. */
2466 r = symlink_idempotent(pp, p, true);
2467 if (r < 0)
2468 goto fail;
2469 }
2470
2471 } else {
2472 _cleanup_free_ char *target = NULL;
2473
2474 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2475 readlink_and_make_absolute(p, &target) >= 0) {
2476 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2477
2478 /* This already exists and is a symlink? Interesting. Maybe it's one created
2479 * by DynamicUser=1 (see above)?
2480 *
2481 * We do this for all directory types except for ConfigurationDirectory=,
2482 * since they all support the private/ symlink logic at least in some
2483 * configurations, see above. */
2484
2485 r = chase(target, NULL, 0, &target_resolved, NULL);
2486 if (r < 0)
2487 goto fail;
2488
2489 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2490 if (!q) {
2491 r = -ENOMEM;
2492 goto fail;
2493 }
2494
2495 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2496 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2497 if (r < 0)
2498 goto fail;
2499
2500 if (path_equal(q_resolved, target_resolved)) {
2501
2502 /* Hmm, apparently DynamicUser= was once turned on for this service,
2503 * but is no longer. Let's move the directory back up. */
2504
2505 log_exec_info(context,
2506 params,
2507 "Found pre-existing private %s= directory %s, migrating to %s.\n"
2508 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2509 exec_directory_type_to_string(type), q, p);
2510
2511 r = RET_NERRNO(unlink(p));
2512 if (r < 0)
2513 goto fail;
2514
2515 r = RET_NERRNO(rename(q, p));
2516 if (r < 0)
2517 goto fail;
2518 }
2519 }
2520
2521 r = mkdir_label(p, context->directories[type].mode);
2522 if (r < 0) {
2523 if (r != -EEXIST)
2524 goto fail;
2525
2526 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2527 struct stat st;
2528
2529 /* Don't change the owner/access mode of the configuration directory,
2530 * as in the common case it is not written to by a service, and shall
2531 * not be writable. */
2532
2533 r = RET_NERRNO(stat(p, &st));
2534 if (r < 0)
2535 goto fail;
2536
2537 /* Still complain if the access mode doesn't match */
2538 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2539 log_exec_warning(context,
2540 params,
2541 "%s \'%s\' already exists but the mode is different. "
2542 "(File system: %o %sMode: %o)",
2543 exec_directory_type_to_string(type), context->directories[type].items[i].path,
2544 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2545
2546 continue;
2547 }
2548 }
2549 }
2550
2551 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2552 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2553 * current UID/GID ownership.) */
2554 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2555 if (r < 0)
2556 goto fail;
2557
2558 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2559 * available to user code anyway */
2560 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2561 continue;
2562
2563 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2564 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2565 * assignments to exist. */
2566 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2567 if (r < 0)
2568 goto fail;
2569 }
2570
2571 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2572 * they are set up later, to allow configuring empty var/run/etc. */
2573 if (!needs_mount_namespace)
2574 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2575 r = create_many_symlinks(params->prefix[type],
2576 context->directories[type].items[i].path,
2577 context->directories[type].items[i].symlinks);
2578 if (r < 0)
2579 goto fail;
2580 }
2581
2582 return 0;
2583
2584 fail:
2585 *exit_status = exit_status_table[type];
2586 return r;
2587 }
2588
2589 #if ENABLE_SMACK
2590 static int setup_smack(
2591 const ExecParameters *params,
2592 const ExecContext *context,
2593 int executable_fd) {
2594 int r;
2595
2596 assert(params);
2597 assert(executable_fd >= 0);
2598
2599 if (context->smack_process_label) {
2600 r = mac_smack_apply_pid(0, context->smack_process_label);
2601 if (r < 0)
2602 return r;
2603 } else if (params->fallback_smack_process_label) {
2604 _cleanup_free_ char *exec_label = NULL;
2605
2606 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
2607 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
2608 return r;
2609
2610 r = mac_smack_apply_pid(0, exec_label ?: params->fallback_smack_process_label);
2611 if (r < 0)
2612 return r;
2613 }
2614
2615 return 0;
2616 }
2617 #endif
2618
2619 static int compile_bind_mounts(
2620 const ExecContext *context,
2621 const ExecParameters *params,
2622 BindMount **ret_bind_mounts,
2623 size_t *ret_n_bind_mounts,
2624 char ***ret_empty_directories) {
2625
2626 _cleanup_strv_free_ char **empty_directories = NULL;
2627 BindMount *bind_mounts = NULL;
2628 size_t n, h = 0;
2629 int r;
2630
2631 assert(context);
2632 assert(params);
2633 assert(ret_bind_mounts);
2634 assert(ret_n_bind_mounts);
2635 assert(ret_empty_directories);
2636
2637 CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2638
2639 n = context->n_bind_mounts;
2640 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2641 if (!params->prefix[t])
2642 continue;
2643
2644 for (size_t i = 0; i < context->directories[t].n_items; i++)
2645 n += !context->directories[t].items[i].only_create;
2646 }
2647
2648 if (n <= 0) {
2649 *ret_bind_mounts = NULL;
2650 *ret_n_bind_mounts = 0;
2651 *ret_empty_directories = NULL;
2652 return 0;
2653 }
2654
2655 bind_mounts = new(BindMount, n);
2656 if (!bind_mounts)
2657 return -ENOMEM;
2658
2659 for (size_t i = 0; i < context->n_bind_mounts; i++) {
2660 BindMount *item = context->bind_mounts + i;
2661 _cleanup_free_ char *s = NULL, *d = NULL;
2662
2663 s = strdup(item->source);
2664 if (!s)
2665 return -ENOMEM;
2666
2667 d = strdup(item->destination);
2668 if (!d)
2669 return -ENOMEM;
2670
2671 bind_mounts[h++] = (BindMount) {
2672 .source = TAKE_PTR(s),
2673 .destination = TAKE_PTR(d),
2674 .read_only = item->read_only,
2675 .recursive = item->recursive,
2676 .ignore_enoent = item->ignore_enoent,
2677 };
2678 }
2679
2680 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2681 if (!params->prefix[t])
2682 continue;
2683
2684 if (context->directories[t].n_items == 0)
2685 continue;
2686
2687 if (exec_directory_is_private(context, t) &&
2688 !exec_context_with_rootfs(context)) {
2689 char *private_root;
2690
2691 /* So this is for a dynamic user, and we need to make sure the process can access its own
2692 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2693 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2694
2695 private_root = path_join(params->prefix[t], "private");
2696 if (!private_root)
2697 return -ENOMEM;
2698
2699 r = strv_consume(&empty_directories, private_root);
2700 if (r < 0)
2701 return r;
2702 }
2703
2704 for (size_t i = 0; i < context->directories[t].n_items; i++) {
2705 _cleanup_free_ char *s = NULL, *d = NULL;
2706
2707 /* When one of the parent directories is in the list, we cannot create the symlink
2708 * for the child directory. See also the comments in setup_exec_directory(). */
2709 if (context->directories[t].items[i].only_create)
2710 continue;
2711
2712 if (exec_directory_is_private(context, t))
2713 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
2714 else
2715 s = path_join(params->prefix[t], context->directories[t].items[i].path);
2716 if (!s)
2717 return -ENOMEM;
2718
2719 if (exec_directory_is_private(context, t) &&
2720 exec_context_with_rootfs(context))
2721 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2722 * directory is not created on the root directory. So, let's bind-mount the directory
2723 * on the 'non-private' place. */
2724 d = path_join(params->prefix[t], context->directories[t].items[i].path);
2725 else
2726 d = strdup(s);
2727 if (!d)
2728 return -ENOMEM;
2729
2730 bind_mounts[h++] = (BindMount) {
2731 .source = TAKE_PTR(s),
2732 .destination = TAKE_PTR(d),
2733 .read_only = false,
2734 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2735 .recursive = true,
2736 .ignore_enoent = false,
2737 };
2738 }
2739 }
2740
2741 assert(h == n);
2742
2743 *ret_bind_mounts = TAKE_PTR(bind_mounts);
2744 *ret_n_bind_mounts = n;
2745 *ret_empty_directories = TAKE_PTR(empty_directories);
2746
2747 return (int) n;
2748 }
2749
2750 /* ret_symlinks will contain a list of pairs src:dest that describes
2751 * the symlinks to create later on. For example, the symlinks needed
2752 * to safely give private directories to DynamicUser=1 users. */
2753 static int compile_symlinks(
2754 const ExecContext *context,
2755 const ExecParameters *params,
2756 bool setup_os_release_symlink,
2757 char ***ret_symlinks) {
2758
2759 _cleanup_strv_free_ char **symlinks = NULL;
2760 int r;
2761
2762 assert(context);
2763 assert(params);
2764 assert(ret_symlinks);
2765
2766 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2767 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
2768 _cleanup_free_ char *private_path = NULL, *path = NULL;
2769
2770 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
2771 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
2772
2773 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2774 dst_abs = path_join(params->prefix[dt], *symlink);
2775 if (!src_abs || !dst_abs)
2776 return -ENOMEM;
2777
2778 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2779 if (r < 0)
2780 return r;
2781 }
2782
2783 if (!exec_directory_is_private(context, dt) ||
2784 exec_context_with_rootfs(context) ||
2785 context->directories[dt].items[i].only_create)
2786 continue;
2787
2788 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
2789 if (!private_path)
2790 return -ENOMEM;
2791
2792 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2793 if (!path)
2794 return -ENOMEM;
2795
2796 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2797 if (r < 0)
2798 return r;
2799 }
2800 }
2801
2802 /* We make the host's os-release available via a symlink, so that we can copy it atomically
2803 * and readers will never get a half-written version. Note that, while the paths specified here are
2804 * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2805 * 'os-release -> .os-release-stage/os-release' is what will be created. */
2806 if (setup_os_release_symlink) {
2807 r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release");
2808 if (r < 0)
2809 return r;
2810
2811 r = strv_extend(&symlinks, "/run/host/os-release");
2812 if (r < 0)
2813 return r;
2814 }
2815
2816 *ret_symlinks = TAKE_PTR(symlinks);
2817
2818 return 0;
2819 }
2820
2821 static bool insist_on_sandboxing(
2822 const ExecContext *context,
2823 const char *root_dir,
2824 const char *root_image,
2825 const BindMount *bind_mounts,
2826 size_t n_bind_mounts) {
2827
2828 assert(context);
2829 assert(n_bind_mounts == 0 || bind_mounts);
2830
2831 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2832 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2833 * rearrange stuff in a way we cannot ignore gracefully. */
2834
2835 if (context->n_temporary_filesystems > 0)
2836 return true;
2837
2838 if (root_dir || root_image)
2839 return true;
2840
2841 if (context->n_mount_images > 0)
2842 return true;
2843
2844 if (context->dynamic_user)
2845 return true;
2846
2847 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
2848 return true;
2849
2850 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2851 * essential. */
2852 for (size_t i = 0; i < n_bind_mounts; i++)
2853 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2854 return true;
2855
2856 if (context->log_namespace)
2857 return true;
2858
2859 return false;
2860 }
2861
2862 static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
2863 _cleanup_close_ int fd = -EBADF;
2864 int r;
2865
2866 if (!runtime || !runtime->ephemeral_copy)
2867 return 0;
2868
2869 r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
2870 if (r < 0)
2871 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
2872
2873 CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
2874
2875 fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
2876 if (fd >= 0)
2877 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
2878 return 0;
2879
2880 if (fd != -EAGAIN)
2881 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
2882
2883 log_debug("Making ephemeral snapshot of %s to %s",
2884 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
2885
2886 if (context->root_image)
2887 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
2888 COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
2889 else
2890 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
2891 AT_FDCWD, runtime->ephemeral_copy,
2892 BTRFS_SNAPSHOT_FALLBACK_COPY |
2893 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
2894 BTRFS_SNAPSHOT_RECURSIVE |
2895 BTRFS_SNAPSHOT_LOCK_BSD);
2896 if (fd < 0)
2897 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
2898 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
2899
2900 if (context->root_image) {
2901 /* A root image might be subject to lots of random writes so let's try to disable COW on it
2902 * which tends to not perform well in combination with lots of random writes.
2903 *
2904 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
2905 * copy, but we at least want to make the intention clear.
2906 */
2907 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
2908 if (r < 0)
2909 log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
2910 }
2911
2912 r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
2913 if (r < 0)
2914 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
2915
2916 return 1;
2917 }
2918
2919 static int verity_settings_prepare(
2920 VeritySettings *verity,
2921 const char *root_image,
2922 const void *root_hash,
2923 size_t root_hash_size,
2924 const char *root_hash_path,
2925 const void *root_hash_sig,
2926 size_t root_hash_sig_size,
2927 const char *root_hash_sig_path,
2928 const char *verity_data_path) {
2929
2930 int r;
2931
2932 assert(verity);
2933
2934 if (root_hash) {
2935 void *d;
2936
2937 d = memdup(root_hash, root_hash_size);
2938 if (!d)
2939 return -ENOMEM;
2940
2941 free_and_replace(verity->root_hash, d);
2942 verity->root_hash_size = root_hash_size;
2943 verity->designator = PARTITION_ROOT;
2944 }
2945
2946 if (root_hash_sig) {
2947 void *d;
2948
2949 d = memdup(root_hash_sig, root_hash_sig_size);
2950 if (!d)
2951 return -ENOMEM;
2952
2953 free_and_replace(verity->root_hash_sig, d);
2954 verity->root_hash_sig_size = root_hash_sig_size;
2955 verity->designator = PARTITION_ROOT;
2956 }
2957
2958 if (verity_data_path) {
2959 r = free_and_strdup(&verity->data_path, verity_data_path);
2960 if (r < 0)
2961 return r;
2962 }
2963
2964 r = verity_settings_load(
2965 verity,
2966 root_image,
2967 root_hash_path,
2968 root_hash_sig_path);
2969 if (r < 0)
2970 return log_debug_errno(r, "Failed to load root hash: %m");
2971
2972 return 0;
2973 }
2974
2975 static int apply_mount_namespace(
2976 ExecCommandFlags command_flags,
2977 const ExecContext *context,
2978 const ExecParameters *params,
2979 ExecRuntime *runtime,
2980 const char *memory_pressure_path,
2981 char **error_path) {
2982
2983 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
2984 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
2985 **read_write_paths_cleanup = NULL;
2986 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
2987 *extension_dir = NULL, *host_os_release_stage = NULL;
2988 const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
2989 char **read_write_paths;
2990 bool needs_sandboxing, setup_os_release_symlink;
2991 BindMount *bind_mounts = NULL;
2992 size_t n_bind_mounts = 0;
2993 int r;
2994
2995 assert(context);
2996
2997 CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
2998
2999 if (params->flags & EXEC_APPLY_CHROOT) {
3000 r = setup_ephemeral(context, runtime);
3001 if (r < 0)
3002 return r;
3003
3004 if (context->root_image)
3005 root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
3006 else
3007 root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
3008 }
3009
3010 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3011 if (r < 0)
3012 return r;
3013
3014 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3015 * service will need to write to it in order to start the notifications. */
3016 if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3017 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3018 if (!read_write_paths_cleanup)
3019 return -ENOMEM;
3020
3021 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3022 if (r < 0)
3023 return r;
3024
3025 read_write_paths = read_write_paths_cleanup;
3026 } else
3027 read_write_paths = context->read_write_paths;
3028
3029 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3030 if (needs_sandboxing) {
3031 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3032 * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3033 * use here. This does not apply when we are using /run/systemd/empty as fallback. */
3034
3035 if (context->private_tmp && runtime && runtime->shared) {
3036 if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3037 tmp_dir = runtime->shared->tmp_dir;
3038 else if (runtime->shared->tmp_dir)
3039 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
3040
3041 if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3042 var_tmp_dir = runtime->shared->var_tmp_dir;
3043 else if (runtime->shared->var_tmp_dir)
3044 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
3045 }
3046 }
3047
3048 /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3049 setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
3050 r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3051 if (r < 0)
3052 return r;
3053
3054 if (context->mount_propagation_flag == MS_SHARED)
3055 log_exec_debug(context,
3056 params,
3057 "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3058
3059 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3060 r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path);
3061 if (r < 0)
3062 return r;
3063 }
3064
3065 if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
3066 propagate_dir = path_join("/run/systemd/propagate/", params->unit_id);
3067 if (!propagate_dir)
3068 return -ENOMEM;
3069
3070 incoming_dir = strdup("/run/systemd/incoming");
3071 if (!incoming_dir)
3072 return -ENOMEM;
3073
3074 extension_dir = strdup("/run/systemd/unit-extensions");
3075 if (!extension_dir)
3076 return -ENOMEM;
3077
3078 /* If running under a different root filesystem, propagate the host's os-release. We make a
3079 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3080 if (setup_os_release_symlink) {
3081 host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3082 if (!host_os_release_stage)
3083 return -ENOMEM;
3084 }
3085 } else {
3086 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3087
3088 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3089 return -ENOMEM;
3090
3091 if (setup_os_release_symlink) {
3092 if (asprintf(&host_os_release_stage,
3093 "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3094 geteuid()) < 0)
3095 return -ENOMEM;
3096 }
3097 }
3098
3099 if (root_image) {
3100 r = verity_settings_prepare(
3101 &verity,
3102 root_image,
3103 context->root_hash, context->root_hash_size, context->root_hash_path,
3104 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3105 context->root_verity);
3106 if (r < 0)
3107 return r;
3108 }
3109
3110 NamespaceParameters parameters = {
3111 .runtime_scope = params->runtime_scope,
3112
3113 .root_directory = root_dir,
3114 .root_image = root_image,
3115 .root_image_options = context->root_image_options,
3116 .root_image_policy = context->root_image_policy ?: &image_policy_service,
3117
3118 .read_write_paths = read_write_paths,
3119 .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
3120 .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
3121
3122 .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
3123 .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
3124
3125 .empty_directories = empty_directories,
3126 .symlinks = symlinks,
3127
3128 .bind_mounts = bind_mounts,
3129 .n_bind_mounts = n_bind_mounts,
3130
3131 .temporary_filesystems = context->temporary_filesystems,
3132 .n_temporary_filesystems = context->n_temporary_filesystems,
3133
3134 .mount_images = context->mount_images,
3135 .n_mount_images = context->n_mount_images,
3136 .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
3137
3138 .tmp_dir = tmp_dir,
3139 .var_tmp_dir = var_tmp_dir,
3140
3141 .creds_path = creds_path,
3142 .log_namespace = context->log_namespace,
3143 .mount_propagation_flag = context->mount_propagation_flag,
3144
3145 .verity = &verity,
3146
3147 .extension_images = context->extension_images,
3148 .n_extension_images = context->n_extension_images,
3149 .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
3150 .extension_directories = context->extension_directories,
3151
3152 .propagate_dir = propagate_dir,
3153 .incoming_dir = incoming_dir,
3154 .extension_dir = extension_dir,
3155 .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
3156 .host_os_release_stage = host_os_release_stage,
3157
3158 /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
3159 * otherwise enforce it, don't ignore protected paths and fail if we are enable to apply the
3160 * sandbox inside the mount namespace. */
3161 .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
3162
3163 .protect_control_groups = needs_sandboxing && context->protect_control_groups,
3164 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
3165 .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
3166 .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
3167 .protect_hostname = needs_sandboxing && context->protect_hostname,
3168
3169 .private_dev = needs_sandboxing && context->private_devices,
3170 .private_network = needs_sandboxing && exec_needs_network_namespace(context),
3171 .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
3172
3173 .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
3174
3175 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3176 .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
3177
3178 .protect_home = needs_sandboxing ? context->protect_home : false,
3179 .protect_system = needs_sandboxing ? context->protect_system : false,
3180 .protect_proc = needs_sandboxing ? context->protect_proc : false,
3181 .proc_subset = needs_sandboxing ? context->proc_subset : false,
3182 };
3183
3184 r = setup_namespace(&parameters, error_path);
3185 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3186 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3187 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3188 * completely different execution environment. */
3189 if (r == -ENOANO) {
3190 if (insist_on_sandboxing(
3191 context,
3192 root_dir, root_image,
3193 bind_mounts,
3194 n_bind_mounts))
3195 return log_exec_debug_errno(context,
3196 params,
3197 SYNTHETIC_ERRNO(EOPNOTSUPP),
3198 "Failed to set up namespace, and refusing to continue since "
3199 "the selected namespacing options alter mount environment non-trivially.\n"
3200 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3201 n_bind_mounts,
3202 context->n_temporary_filesystems,
3203 yes_no(root_dir),
3204 yes_no(root_image),
3205 yes_no(context->dynamic_user));
3206
3207 log_exec_debug(context, params, "Failed to set up namespace, assuming containerized execution and ignoring.");
3208 return 0;
3209 }
3210
3211 return r;
3212 }
3213
3214 static int apply_working_directory(
3215 const ExecContext *context,
3216 const ExecParameters *params,
3217 ExecRuntime *runtime,
3218 const char *home,
3219 int *exit_status) {
3220
3221 const char *d, *wd;
3222
3223 assert(context);
3224 assert(exit_status);
3225
3226 if (context->working_directory_home) {
3227
3228 if (!home) {
3229 *exit_status = EXIT_CHDIR;
3230 return -ENXIO;
3231 }
3232
3233 wd = home;
3234
3235 } else
3236 wd = empty_to_root(context->working_directory);
3237
3238 if (params->flags & EXEC_APPLY_CHROOT)
3239 d = wd;
3240 else
3241 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
3242
3243 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3244 *exit_status = EXIT_CHDIR;
3245 return -errno;
3246 }
3247
3248 return 0;
3249 }
3250
3251 static int apply_root_directory(
3252 const ExecContext *context,
3253 const ExecParameters *params,
3254 ExecRuntime *runtime,
3255 const bool needs_mount_ns,
3256 int *exit_status) {
3257
3258 assert(context);
3259 assert(exit_status);
3260
3261 if (params->flags & EXEC_APPLY_CHROOT)
3262 if (!needs_mount_ns && context->root_directory)
3263 if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
3264 *exit_status = EXIT_CHROOT;
3265 return -errno;
3266 }
3267
3268 return 0;
3269 }
3270
3271 static int setup_keyring(
3272 const ExecContext *context,
3273 const ExecParameters *p,
3274 uid_t uid, gid_t gid) {
3275
3276 key_serial_t keyring;
3277 int r = 0;
3278 uid_t saved_uid;
3279 gid_t saved_gid;
3280
3281 assert(context);
3282 assert(p);
3283
3284 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3285 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3286 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3287 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3288 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3289 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3290
3291 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3292 return 0;
3293
3294 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3295 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3296 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3297 * & group is just as nasty as acquiring a reference to the user keyring. */
3298
3299 saved_uid = getuid();
3300 saved_gid = getgid();
3301
3302 if (gid_is_valid(gid) && gid != saved_gid) {
3303 if (setregid(gid, -1) < 0)
3304 return log_exec_error_errno(context,
3305 p,
3306 errno,
3307 "Failed to change GID for user keyring: %m");
3308 }
3309
3310 if (uid_is_valid(uid) && uid != saved_uid) {
3311 if (setreuid(uid, -1) < 0) {
3312 r = log_exec_error_errno(context,
3313 p,
3314 errno,
3315 "Failed to change UID for user keyring: %m");
3316 goto out;
3317 }
3318 }
3319
3320 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3321 if (keyring == -1) {
3322 if (errno == ENOSYS)
3323 log_exec_debug_errno(context,
3324 p,
3325 errno,
3326 "Kernel keyring not supported, ignoring.");
3327 else if (ERRNO_IS_PRIVILEGE(errno))
3328 log_exec_debug_errno(context,
3329 p,
3330 errno,
3331 "Kernel keyring access prohibited, ignoring.");
3332 else if (errno == EDQUOT)
3333 log_exec_debug_errno(context,
3334 p,
3335 errno,
3336 "Out of kernel keyrings to allocate, ignoring.");
3337 else
3338 r = log_exec_error_errno(context,
3339 p,
3340 errno,
3341 "Setting up kernel keyring failed: %m");
3342
3343 goto out;
3344 }
3345
3346 /* When requested link the user keyring into the session keyring. */
3347 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3348
3349 if (keyctl(KEYCTL_LINK,
3350 KEY_SPEC_USER_KEYRING,
3351 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3352 r = log_exec_error_errno(context,
3353 p,
3354 errno,
3355 "Failed to link user keyring into session keyring: %m");
3356 goto out;
3357 }
3358 }
3359
3360 /* Restore uid/gid back */
3361 if (uid_is_valid(uid) && uid != saved_uid) {
3362 if (setreuid(saved_uid, -1) < 0) {
3363 r = log_exec_error_errno(context,
3364 p,
3365 errno,
3366 "Failed to change UID back for user keyring: %m");
3367 goto out;
3368 }
3369 }
3370
3371 if (gid_is_valid(gid) && gid != saved_gid) {
3372 if (setregid(saved_gid, -1) < 0)
3373 return log_exec_error_errno(context,
3374 p,
3375 errno,
3376 "Failed to change GID back for user keyring: %m");
3377 }
3378
3379 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3380 if (!sd_id128_is_null(p->invocation_id)) {
3381 key_serial_t key;
3382
3383 key = add_key("user",
3384 "invocation_id",
3385 &p->invocation_id,
3386 sizeof(p->invocation_id),
3387 KEY_SPEC_SESSION_KEYRING);
3388 if (key == -1)
3389 log_exec_debug_errno(context,
3390 p,
3391 errno,
3392 "Failed to add invocation ID to keyring, ignoring: %m");
3393 else {
3394 if (keyctl(KEYCTL_SETPERM, key,
3395 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3396 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3397 r = log_exec_error_errno(context,
3398 p,
3399 errno,
3400 "Failed to restrict invocation ID permission: %m");
3401 }
3402 }
3403
3404 out:
3405 /* Revert back uid & gid for the last time, and exit */
3406 /* no extra logging, as only the first already reported error matters */
3407 if (getuid() != saved_uid)
3408 (void) setreuid(saved_uid, -1);
3409
3410 if (getgid() != saved_gid)
3411 (void) setregid(saved_gid, -1);
3412
3413 return r;
3414 }
3415
/* Appends each non-negative fd of the two-element 'pair' to 'array', advancing *n for every entry
 * stored. The caller must make sure the array has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++)
                if (pair[i] >= 0)
                        array[(*n)++] = pair[i];
}
3426
3427 static int close_remaining_fds(
3428 const ExecParameters *params,
3429 const ExecRuntime *runtime,
3430 int socket_fd,
3431 const int *fds, size_t n_fds) {
3432
3433 size_t n_dont_close = 0;
3434 int dont_close[n_fds + 14];
3435
3436 assert(params);
3437
3438 if (params->stdin_fd >= 0)
3439 dont_close[n_dont_close++] = params->stdin_fd;
3440 if (params->stdout_fd >= 0)
3441 dont_close[n_dont_close++] = params->stdout_fd;
3442 if (params->stderr_fd >= 0)
3443 dont_close[n_dont_close++] = params->stderr_fd;
3444
3445 if (socket_fd >= 0)
3446 dont_close[n_dont_close++] = socket_fd;
3447 if (n_fds > 0) {
3448 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3449 n_dont_close += n_fds;
3450 }
3451
3452 if (runtime)
3453 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3454
3455 if (runtime && runtime->shared) {
3456 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3457 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
3458 }
3459
3460 if (runtime && runtime->dynamic_creds) {
3461 if (runtime->dynamic_creds->user)
3462 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3463 if (runtime->dynamic_creds->group)
3464 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
3465 }
3466
3467 if (params->user_lookup_fd >= 0)
3468 dont_close[n_dont_close++] = params->user_lookup_fd;
3469
3470 return close_all_fds(dont_close, n_dont_close);
3471 }
3472
/* Sends the resolved UID/GID to PID 1 once we know it, as a single datagram carrying the UID, the
 * GID and the unit name. Nothing is sent if there is no lookup fd, or if neither a valid UID nor a
 * valid GID was resolved.
 *
 * Returns 0 on success or when nothing was sent, -errno if writev() fails. */
static int send_user_lookup(
                const char *unit_id,
                int user_lookup_fd,
                uid_t uid,
                gid_t gid) {

        assert(unit_id);

        if (user_lookup_fd < 0 || (!uid_is_valid(uid) && !gid_is_valid(gid)))
                return 0;

        struct iovec iov[] = {
                IOVEC_MAKE(&uid, sizeof(uid)),
                IOVEC_MAKE(&gid, sizeof(gid)),
                IOVEC_MAKE_STRING(unit_id),
        };

        if (writev(user_lookup_fd, iov, ELEMENTSOF(iov)) < 0)
                return -errno;

        return 0;
}
3500
3501 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3502 int r;
3503
3504 assert(c);
3505 assert(home);
3506 assert(buf);
3507
3508 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3509
3510 if (*home)
3511 return 0;
3512
3513 if (!c->working_directory_home)
3514 return 0;
3515
3516 r = get_home_dir(buf);
3517 if (r < 0)
3518 return r;
3519
3520 *home = *buf;
3521 return 1;
3522 }
3523
3524 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3525 _cleanup_strv_free_ char ** list = NULL;
3526 int r;
3527
3528 assert(c);
3529 assert(p);
3530 assert(ret);
3531
3532 assert(c->dynamic_user);
3533
3534 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3535 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3536 * directories. */
3537
3538 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3539 if (t == EXEC_DIRECTORY_CONFIGURATION)
3540 continue;
3541
3542 if (!p->prefix[t])
3543 continue;
3544
3545 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3546 char *e;
3547
3548 if (exec_directory_is_private(c, t))
3549 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3550 else
3551 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3552 if (!e)
3553 return -ENOMEM;
3554
3555 r = strv_consume(&list, e);
3556 if (r < 0)
3557 return r;
3558 }
3559 }
3560
3561 *ret = TAKE_PTR(list);
3562
3563 return 0;
3564 }
3565
3566 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3567 _cleanup_(cpu_set_reset) CPUSet s = {};
3568 int r;
3569
3570 assert(c);
3571 assert(ret);
3572
3573 if (!c->numa_policy.nodes.set) {
3574 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3575 return 0;
3576 }
3577
3578 r = numa_to_cpu_set(&c->numa_policy, &s);
3579 if (r < 0)
3580 return r;
3581
3582 cpu_set_reset(ret);
3583
3584 return cpu_set_add_all(ret, &s);
3585 }
3586
3587 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int *fd) {
3588 int r;
3589
3590 assert(fds);
3591 assert(n_fds);
3592 assert(*n_fds < fds_size);
3593 assert(fd);
3594
3595 if (*fd < 0)
3596 return 0;
3597
3598 if (*fd < 3 + (int) *n_fds) {
3599 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3600 * the fds we pass to the process (or which are closed only during execve). */
3601
3602 r = fcntl(*fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3603 if (r < 0)
3604 return -errno;
3605
3606 close_and_replace(*fd, r);
3607 }
3608
3609 fds[(*n_fds)++] = *fd;
3610 return 1;
3611 }
3612
3613 static int connect_unix_harder(const ExecContext *c, const ExecParameters *p, const OpenFile *of, int ofd) {
3614 union sockaddr_union addr = {
3615 .un.sun_family = AF_UNIX,
3616 };
3617 socklen_t sa_len;
3618 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3619 int r;
3620
3621 assert(c);
3622 assert(p);
3623 assert(of);
3624 assert(ofd >= 0);
3625
3626 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3627 if (r < 0)
3628 return log_exec_error_errno(c, p, r, "Failed to set sockaddr for %s: %m", of->path);
3629
3630 sa_len = r;
3631
3632 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
3633 _cleanup_close_ int fd = -EBADF;
3634
3635 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
3636 if (fd < 0)
3637 return log_exec_error_errno(c,
3638 p,
3639 errno,
3640 "Failed to create socket for %s: %m",
3641 of->path);
3642
3643 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3644 if (r == -EPROTOTYPE)
3645 continue;
3646 if (r < 0)
3647 return log_exec_error_errno(c,
3648 p,
3649 r,
3650 "Failed to connect socket for %s: %m",
3651 of->path);
3652
3653 return TAKE_FD(fd);
3654 }
3655
3656 return log_exec_error_errno(c,
3657 p,
3658 SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".",
3659 of->path);
3660 }
3661
3662 static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const OpenFile *of) {
3663 struct stat st;
3664 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3665
3666 assert(c);
3667 assert(p);
3668 assert(of);
3669
3670 ofd = open(of->path, O_PATH | O_CLOEXEC);
3671 if (ofd < 0)
3672 return log_exec_error_errno(c, p, errno, "Could not open \"%s\": %m", of->path);
3673
3674 if (fstat(ofd, &st) < 0)
3675 return log_exec_error_errno(c, p, errno, "Failed to stat %s: %m", of->path);
3676
3677 if (S_ISSOCK(st.st_mode)) {
3678 fd = connect_unix_harder(c, p, of, ofd);
3679 if (fd < 0)
3680 return fd;
3681
3682 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
3683 return log_exec_error_errno(c, p, errno, "Failed to shutdown send for socket %s: %m",
3684 of->path);
3685
3686 log_exec_debug(c, p, "socket %s opened (fd=%d)", of->path, fd);
3687 } else {
3688 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3689 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3690 flags |= O_APPEND;
3691 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3692 flags |= O_TRUNC;
3693
3694 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3695 if (fd < 0)
3696 return log_exec_error_errno(c, p, fd, "Failed to open file %s: %m", of->path);
3697
3698 log_exec_debug(c, p, "file %s opened (fd=%d)", of->path, fd);
3699 }
3700
3701 return TAKE_FD(fd);
3702 }
3703
3704 static int collect_open_file_fds(const ExecContext *c, ExecParameters *p, size_t *n_fds) {
3705 int r;
3706
3707 assert(c);
3708 assert(p);
3709 assert(n_fds);
3710
3711 LIST_FOREACH(open_files, of, p->open_files) {
3712 _cleanup_close_ int fd = -EBADF;
3713
3714 fd = get_open_file_fd(c, p, of);
3715 if (fd < 0) {
3716 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3717 log_exec_debug_errno(c, p, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
3718 continue;
3719 }
3720
3721 return fd;
3722 }
3723
3724 if (!GREEDY_REALLOC(p->fds, *n_fds + 1))
3725 return -ENOMEM;
3726
3727 r = strv_extend(&p->fd_names, of->fdname);
3728 if (r < 0)
3729 return r;
3730
3731 p->fds[*n_fds] = TAKE_FD(fd);
3732
3733 (*n_fds)++;
3734 }
3735
3736 return 0;
3737 }
3738
3739 static void log_command_line(
3740 const ExecContext *context,
3741 const ExecParameters *params,
3742 const char *msg,
3743 const char *executable,
3744 char **argv) {
3745
3746 assert(context);
3747 assert(params);
3748 assert(msg);
3749 assert(executable);
3750
3751 if (!DEBUG_LOGGING)
3752 return;
3753
3754 _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3755
3756 log_exec_struct(context, params, LOG_DEBUG,
3757 "EXECUTABLE=%s", executable,
3758 LOG_EXEC_MESSAGE(params, "%s: %s", msg, strnull(cmdline)),
3759 LOG_EXEC_INVOCATION_ID(params));
3760 }
3761
3762 static bool exec_context_need_unprivileged_private_users(
3763 const ExecContext *context,
3764 const ExecParameters *params) {
3765
3766 assert(context);
3767 assert(params);
3768
3769 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3770 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3771 * (system manager) then we have privileges and don't need this. */
3772 if (params->runtime_scope != RUNTIME_SCOPE_USER)
3773 return false;
3774
3775 return context->private_users ||
3776 context->private_tmp ||
3777 context->private_devices ||
3778 context->private_network ||
3779 context->network_namespace_path ||
3780 context->private_ipc ||
3781 context->ipc_namespace_path ||
3782 context->private_mounts > 0 ||
3783 context->mount_apivfs ||
3784 context->n_bind_mounts > 0 ||
3785 context->n_temporary_filesystems > 0 ||
3786 context->root_directory ||
3787 !strv_isempty(context->extension_directories) ||
3788 context->protect_system != PROTECT_SYSTEM_NO ||
3789 context->protect_home != PROTECT_HOME_NO ||
3790 context->protect_kernel_tunables ||
3791 context->protect_kernel_modules ||
3792 context->protect_kernel_logs ||
3793 context->protect_control_groups ||
3794 context->protect_clock ||
3795 context->protect_hostname ||
3796 !strv_isempty(context->read_write_paths) ||
3797 !strv_isempty(context->read_only_paths) ||
3798 !strv_isempty(context->inaccessible_paths) ||
3799 !strv_isempty(context->exec_paths) ||
3800 !strv_isempty(context->no_exec_paths);
3801 }
3802
3803 static bool exec_context_shall_confirm_spawn(const ExecContext *context) {
3804 assert(context);
3805
3806 if (confirm_spawn_disabled())
3807 return false;
3808
3809 /* For some reasons units remaining in the same process group
3810 * as PID 1 fail to acquire the console even if it's not used
3811 * by any process. So skip the confirmation question for them. */
3812 return !context->same_pgrp;
3813 }
3814
3815 static int exec_context_named_iofds(
3816 const ExecContext *c,
3817 const ExecParameters *p,
3818 int named_iofds[static 3]) {
3819
3820 size_t targets;
3821 const char* stdio_fdname[3];
3822 size_t n_fds;
3823
3824 assert(c);
3825 assert(p);
3826 assert(named_iofds);
3827
3828 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3829 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3830 (c->std_error == EXEC_OUTPUT_NAMED_FD);
3831
3832 for (size_t i = 0; i < 3; i++)
3833 stdio_fdname[i] = exec_context_fdname(c, i);
3834
3835 n_fds = p->n_storage_fds + p->n_socket_fds;
3836
3837 for (size_t i = 0; i < n_fds && targets > 0; i++)
3838 if (named_iofds[STDIN_FILENO] < 0 &&
3839 c->std_input == EXEC_INPUT_NAMED_FD &&
3840 stdio_fdname[STDIN_FILENO] &&
3841 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3842
3843 named_iofds[STDIN_FILENO] = p->fds[i];
3844 targets--;
3845
3846 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3847 c->std_output == EXEC_OUTPUT_NAMED_FD &&
3848 stdio_fdname[STDOUT_FILENO] &&
3849 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3850
3851 named_iofds[STDOUT_FILENO] = p->fds[i];
3852 targets--;
3853
3854 } else if (named_iofds[STDERR_FILENO] < 0 &&
3855 c->std_error == EXEC_OUTPUT_NAMED_FD &&
3856 stdio_fdname[STDERR_FILENO] &&
3857 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3858
3859 named_iofds[STDERR_FILENO] = p->fds[i];
3860 targets--;
3861 }
3862
3863 return targets == 0 ? 0 : -ENOENT;
3864 }
3865
3866 static void exec_shared_runtime_close(ExecSharedRuntime *shared) {
3867 if (!shared)
3868 return;
3869
3870 safe_close_pair(shared->netns_storage_socket);
3871 safe_close_pair(shared->ipcns_storage_socket);
3872 }
3873
3874 static void exec_runtime_close(ExecRuntime *rt) {
3875 if (!rt)
3876 return;
3877
3878 safe_close_pair(rt->ephemeral_storage_socket);
3879
3880 exec_shared_runtime_close(rt->shared);
3881 dynamic_creds_close(rt->dynamic_creds);
3882 }
3883
3884 static void exec_params_close(ExecParameters *p) {
3885 if (!p)
3886 return;
3887
3888 p->stdin_fd = safe_close(p->stdin_fd);
3889 p->stdout_fd = safe_close(p->stdout_fd);
3890 p->stderr_fd = safe_close(p->stderr_fd);
3891 }
3892
3893 int exec_invoke(
3894 const ExecCommand *command,
3895 const ExecContext *context,
3896 ExecParameters *params,
3897 ExecRuntime *runtime,
3898 const CGroupContext *cgroup_context,
3899 int *exit_status) {
3900
3901 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
3902 int r, ngids = 0;
3903 _cleanup_free_ gid_t *supplementary_gids = NULL;
3904 const char *username = NULL, *groupname = NULL;
3905 _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
3906 const char *home = NULL, *shell = NULL;
3907 char **final_argv = NULL;
3908 dev_t journal_stream_dev = 0;
3909 ino_t journal_stream_ino = 0;
3910 bool userns_set_up = false;
3911 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3912 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3913 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3914 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
3915 bool keep_seccomp_privileges = false;
3916 #if HAVE_SELINUX
3917 _cleanup_free_ char *mac_selinux_context_net = NULL;
3918 bool use_selinux = false;
3919 #endif
3920 #if ENABLE_SMACK
3921 bool use_smack = false;
3922 #endif
3923 #if HAVE_APPARMOR
3924 bool use_apparmor = false;
3925 #endif
3926 #if HAVE_SECCOMP
3927 uint64_t saved_bset = 0;
3928 #endif
3929 uid_t saved_uid = getuid();
3930 gid_t saved_gid = getgid();
3931 uid_t uid = UID_INVALID;
3932 gid_t gid = GID_INVALID;
3933 size_t n_fds, /* fds to pass to the child */
3934 n_keep_fds; /* total number of fds not to close */
3935 int secure_bits;
3936 _cleanup_free_ gid_t *gids_after_pam = NULL;
3937 int ngids_after_pam = 0;
3938
3939 int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
3940 size_t n_storage_fds, n_socket_fds;
3941
3942 assert(command);
3943 assert(context);
3944 assert(params);
3945 assert(exit_status);
3946
3947 /* This should be mostly redundant, as the log level is also passed as an argument of the executor,
3948 * and is already applied earlier. Just for safety. */
3949 if (context->log_level_max >= 0)
3950 log_set_max_level(context->log_level_max);
3951
3952 /* Explicitly test for CVE-2021-4034 inspired invocations */
3953 if (!command->path || strv_isempty(command->argv)) {
3954 *exit_status = EXIT_EXEC;
3955 return log_exec_error_errno(
3956 context,
3957 params,
3958 SYNTHETIC_ERRNO(EINVAL),
3959 "Invalid command line arguments.");
3960 }
3961
3962 LOG_CONTEXT_PUSH_EXEC(context, params);
3963
3964 if (context->std_input == EXEC_INPUT_SOCKET ||
3965 context->std_output == EXEC_OUTPUT_SOCKET ||
3966 context->std_error == EXEC_OUTPUT_SOCKET) {
3967
3968 if (params->n_socket_fds > 1)
3969 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
3970
3971 if (params->n_socket_fds == 0)
3972 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
3973
3974 socket_fd = params->fds[0];
3975 n_storage_fds = n_socket_fds = 0;
3976 } else {
3977 n_socket_fds = params->n_socket_fds;
3978 n_storage_fds = params->n_storage_fds;
3979 }
3980 n_fds = n_socket_fds + n_storage_fds;
3981
3982 r = exec_context_named_iofds(context, params, named_iofds);
3983 if (r < 0)
3984 return log_exec_error_errno(context, params, r, "Failed to load a named file descriptor: %m");
3985
3986 rename_process_from_path(command->path);
3987
3988 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3989 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3990 * both of which will be demoted to SIG_DFL. */
3991 (void) default_signals(SIGNALS_CRASH_HANDLER,
3992 SIGNALS_IGNORE);
3993
3994 if (context->ignore_sigpipe)
3995 (void) ignore_signals(SIGPIPE);
3996
3997 r = reset_signal_mask();
3998 if (r < 0) {
3999 *exit_status = EXIT_SIGNAL_MASK;
4000 return log_exec_error_errno(context, params, r, "Failed to set process signal mask: %m");
4001 }
4002
4003 if (params->idle_pipe)
4004 do_idle_pipe_dance(params->idle_pipe);
4005
4006 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4007 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4008 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4009 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4010
4011 log_forget_fds();
4012 log_set_open_when_needed(true);
4013 log_settle_target();
4014
4015 /* In case anything used libc syslog(), close this here, too */
4016 closelog();
4017
4018 r = collect_open_file_fds(context, params, &n_fds);
4019 if (r < 0) {
4020 *exit_status = EXIT_FDS;
4021 return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m");
4022 }
4023
4024 int keep_fds[n_fds + 3];
4025 memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int));
4026 n_keep_fds = n_fds;
4027
4028 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->exec_fd);
4029 if (r < 0) {
4030 *exit_status = EXIT_FDS;
4031 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4032 }
4033
4034 #if HAVE_LIBBPF
4035 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->bpf_outer_map_fd);
4036 if (r < 0) {
4037 *exit_status = EXIT_FDS;
4038 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4039 }
4040 #endif
4041
4042 r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds);
4043 if (r < 0) {
4044 *exit_status = EXIT_FDS;
4045 return log_exec_error_errno(context, params, r, "Failed to close unwanted file descriptors: %m");
4046 }
4047
4048 if (!context->same_pgrp &&
4049 setsid() < 0) {
4050 *exit_status = EXIT_SETSID;
4051 return log_exec_error_errno(context, params, errno, "Failed to create new process session: %m");
4052 }
4053
4054 exec_context_tty_reset(context, params);
4055
4056 if (params->shall_confirm_spawn && exec_context_shall_confirm_spawn(context)) {
4057 _cleanup_free_ char *cmdline = NULL;
4058
4059 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4060 if (!cmdline) {
4061 *exit_status = EXIT_MEMORY;
4062 return log_oom();
4063 }
4064
4065 r = ask_for_confirmation(context, params, cmdline);
4066 if (r != CONFIRM_EXECUTE) {
4067 if (r == CONFIRM_PRETEND_SUCCESS) {
4068 *exit_status = EXIT_SUCCESS;
4069 return 0;
4070 }
4071
4072 *exit_status = EXIT_CONFIRM;
4073 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ECANCELED),
4074 "Execution cancelled by the user");
4075 }
4076 }
4077
4078 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4079 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4080 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4081 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4082 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4083 if (setenv("SYSTEMD_ACTIVATION_UNIT", params->unit_id, true) != 0 ||
4084 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4085 *exit_status = EXIT_MEMORY;
4086 return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4087 }
4088
4089 if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4090 _cleanup_strv_free_ char **suggested_paths = NULL;
4091
4092 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4093 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4094 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4095 *exit_status = EXIT_USER;
4096 return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4097 }
4098
4099 r = compile_suggested_paths(context, params, &suggested_paths);
4100 if (r < 0) {
4101 *exit_status = EXIT_MEMORY;
4102 return log_oom();
4103 }
4104
4105 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4106 if (r < 0) {
4107 *exit_status = EXIT_USER;
4108 if (r == -EILSEQ)
4109 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4110 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4111 return log_exec_error_errno(context, params, r, "Failed to update dynamic user credentials: %m");
4112 }
4113
4114 if (!uid_is_valid(uid)) {
4115 *exit_status = EXIT_USER;
4116 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4117 }
4118
4119 if (!gid_is_valid(gid)) {
4120 *exit_status = EXIT_USER;
4121 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4122 }
4123
4124 if (runtime->dynamic_creds->user)
4125 username = runtime->dynamic_creds->user->name;
4126
4127 } else {
4128 if (context->user) {
4129 r = get_fixed_user(context->user, &username, &uid, &gid, &home, &shell);
4130 if (r < 0) {
4131 *exit_status = EXIT_USER;
4132 return log_exec_error_errno(context, params, r, "Failed to determine user credentials: %m");
4133 }
4134 }
4135
4136 if (context->group) {
4137 r = get_fixed_group(context->group, &groupname, &gid);
4138 if (r < 0) {
4139 *exit_status = EXIT_GROUP;
4140 return log_exec_error_errno(context, params, r, "Failed to determine group credentials: %m");
4141 }
4142 }
4143 }
4144
4145 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4146 r = get_supplementary_groups(context, username, groupname, gid,
4147 &supplementary_gids, &ngids);
4148 if (r < 0) {
4149 *exit_status = EXIT_GROUP;
4150 return log_exec_error_errno(context, params, r, "Failed to determine supplementary groups: %m");
4151 }
4152
4153 r = send_user_lookup(params->unit_id, params->user_lookup_fd, uid, gid);
4154 if (r < 0) {
4155 *exit_status = EXIT_USER;
4156 return log_exec_error_errno(context, params, r, "Failed to send user credentials to PID1: %m");
4157 }
4158
4159 params->user_lookup_fd = safe_close(params->user_lookup_fd);
4160
4161 r = acquire_home(context, uid, &home, &home_buffer);
4162 if (r < 0) {
4163 *exit_status = EXIT_CHDIR;
4164 return log_exec_error_errno(context, params, r, "Failed to determine $HOME for user: %m");
4165 }
4166
4167 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4168 if (socket_fd >= 0)
4169 (void) fd_nonblock(socket_fd, false);
4170
4171 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4172 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4173 if (params->cgroup_path) {
4174 _cleanup_free_ char *p = NULL;
4175
4176 r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4177 if (r < 0) {
4178 *exit_status = EXIT_CGROUP;
4179 return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4180 }
4181
4182 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4183 if (r == -EUCLEAN) {
4184 *exit_status = EXIT_CGROUP;
4185 return log_exec_error_errno(context, params, r, "Failed to attach process to cgroup %s "
4186 "because the cgroup or one of its parents or "
4187 "siblings is in the threaded mode: %m", p);
4188 }
4189 if (r < 0) {
4190 *exit_status = EXIT_CGROUP;
4191 return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", p);
4192 }
4193 }
4194
4195 if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4196 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4197 if (r < 0) {
4198 *exit_status = EXIT_NETWORK;
4199 return log_exec_error_errno(context, params, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4200 }
4201 }
4202
4203 if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4204 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4205 if (r < 0) {
4206 *exit_status = EXIT_NAMESPACE;
4207 return log_exec_error_errno(context, params, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4208 }
4209 }
4210
4211 r = setup_input(context, params, socket_fd, named_iofds);
4212 if (r < 0) {
4213 *exit_status = EXIT_STDIN;
4214 return log_exec_error_errno(context, params, r, "Failed to set up standard input: %m");
4215 }
4216
4217 r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4218 if (r < 0) {
4219 *exit_status = EXIT_STDOUT;
4220 return log_exec_error_errno(context, params, r, "Failed to set up standard output: %m");
4221 }
4222
4223 r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4224 if (r < 0) {
4225 *exit_status = EXIT_STDERR;
4226 return log_exec_error_errno(context, params, r, "Failed to set up standard error output: %m");
4227 }
4228
4229 if (context->oom_score_adjust_set) {
4230 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4231 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4232 r = set_oom_score_adjust(context->oom_score_adjust);
4233 if (ERRNO_IS_NEG_PRIVILEGE(r))
4234 log_exec_debug_errno(context, params, r,
4235 "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4236 else if (r < 0) {
4237 *exit_status = EXIT_OOM_ADJUST;
4238 return log_exec_error_errno(context, params, r, "Failed to adjust OOM setting: %m");
4239 }
4240 }
4241
4242 if (context->coredump_filter_set) {
4243 r = set_coredump_filter(context->coredump_filter);
4244 if (ERRNO_IS_NEG_PRIVILEGE(r))
4245 log_exec_debug_errno(context, params, r, "Failed to adjust coredump_filter, ignoring: %m");
4246 else if (r < 0) {
4247 *exit_status = EXIT_LIMITS;
4248 return log_exec_error_errno(context, params, r, "Failed to adjust coredump_filter: %m");
4249 }
4250 }
4251
4252 if (context->nice_set) {
4253 r = setpriority_closest(context->nice);
4254 if (r < 0) {
4255 *exit_status = EXIT_NICE;
4256 return log_exec_error_errno(context, params, r, "Failed to set up process scheduling priority (nice level): %m");
4257 }
4258 }
4259
4260 if (context->cpu_sched_set) {
4261 struct sched_param param = {
4262 .sched_priority = context->cpu_sched_priority,
4263 };
4264
4265 r = sched_setscheduler(0,
4266 context->cpu_sched_policy |
4267 (context->cpu_sched_reset_on_fork ?
4268 SCHED_RESET_ON_FORK : 0),
4269 &param);
4270 if (r < 0) {
4271 *exit_status = EXIT_SETSCHEDULER;
4272 return log_exec_error_errno(context, params, errno, "Failed to set up CPU scheduling: %m");
4273 }
4274 }
4275
4276 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4277 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4278 const CPUSet *cpu_set;
4279
4280 if (context->cpu_affinity_from_numa) {
4281 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4282 if (r < 0) {
4283 *exit_status = EXIT_CPUAFFINITY;
4284 return log_exec_error_errno(context, params, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4285 }
4286
4287 cpu_set = &converted_cpu_set;
4288 } else
4289 cpu_set = &context->cpu_set;
4290
4291 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4292 *exit_status = EXIT_CPUAFFINITY;
4293 return log_exec_error_errno(context, params, errno, "Failed to set up CPU affinity: %m");
4294 }
4295 }
4296
4297 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4298 r = apply_numa_policy(&context->numa_policy);
4299 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4300 log_exec_debug_errno(context, params, r, "NUMA support not available, ignoring.");
4301 else if (r < 0) {
4302 *exit_status = EXIT_NUMA_POLICY;
4303 return log_exec_error_errno(context, params, r, "Failed to set NUMA memory policy: %m");
4304 }
4305 }
4306
4307 if (context->ioprio_set)
4308 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4309 *exit_status = EXIT_IOPRIO;
4310 return log_exec_error_errno(context, params, errno, "Failed to set up IO scheduling priority: %m");
4311 }
4312
4313 if (context->timer_slack_nsec != NSEC_INFINITY)
4314 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4315 *exit_status = EXIT_TIMERSLACK;
4316 return log_exec_error_errno(context, params, errno, "Failed to set up timer slack: %m");
4317 }
4318
4319 if (context->personality != PERSONALITY_INVALID) {
4320 r = safe_personality(context->personality);
4321 if (r < 0) {
4322 *exit_status = EXIT_PERSONALITY;
4323 return log_exec_error_errno(context, params, r, "Failed to set up execution domain (personality): %m");
4324 }
4325 }
4326
4327 #if ENABLE_UTMP
4328 if (context->utmp_id) {
4329 const char *line = context->tty_path ?
4330 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4331 NULL;
4332 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4333 line,
4334 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4335 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4336 USER_PROCESS,
4337 username);
4338 }
4339 #endif
4340
4341 if (uid_is_valid(uid)) {
4342 r = chown_terminal(STDIN_FILENO, uid);
4343 if (r < 0) {
4344 *exit_status = EXIT_STDIN;
4345 return log_exec_error_errno(context, params, r, "Failed to change ownership of terminal: %m");
4346 }
4347 }
4348
4349 if (params->cgroup_path) {
4350 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4351 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4352 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4353 * touch a single hierarchy too. */
4354
4355 if (params->flags & EXEC_CGROUP_DELEGATE) {
4356 _cleanup_free_ char *p = NULL;
4357
4358 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4359 if (r < 0) {
4360 *exit_status = EXIT_CGROUP;
4361 return log_exec_error_errno(context, params, r, "Failed to adjust control group access: %m");
4362 }
4363
4364 r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4365 if (r < 0) {
4366 *exit_status = EXIT_CGROUP;
4367 return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4368 }
4369 if (r > 0) {
4370 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
4371 if (r < 0) {
4372 *exit_status = EXIT_CGROUP;
4373 return log_exec_error_errno(context, params, r, "Failed to adjust control subgroup access: %m");
4374 }
4375 }
4376 }
4377
4378 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4379 if (cgroup_context_want_memory_pressure(cgroup_context)) {
4380 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4381 if (r < 0) {
4382 *exit_status = EXIT_MEMORY;
4383 return log_oom();
4384 }
4385
4386 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4387 if (r < 0) {
4388 log_exec_full_errno(context, params, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4389 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4390 memory_pressure_path = mfree(memory_pressure_path);
4391 }
4392 } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4393 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4394 if (!memory_pressure_path) {
4395 *exit_status = EXIT_MEMORY;
4396 return log_oom();
4397 }
4398 }
4399 }
4400 }
4401
4402 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4403
4404 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4405 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4406 if (r < 0)
4407 return log_exec_error_errno(context, params, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4408 }
4409
4410 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4411 r = exec_setup_credentials(context, params, params->unit_id, uid, gid);
4412 if (r < 0) {
4413 *exit_status = EXIT_CREDENTIALS;
4414 return log_exec_error_errno(context, params, r, "Failed to set up credentials: %m");
4415 }
4416 }
4417
4418 r = build_environment(
4419 context,
4420 params,
4421 cgroup_context,
4422 n_fds,
4423 home,
4424 username,
4425 shell,
4426 journal_stream_dev,
4427 journal_stream_ino,
4428 memory_pressure_path,
4429 &our_env);
4430 if (r < 0) {
4431 *exit_status = EXIT_MEMORY;
4432 return log_oom();
4433 }
4434
4435 r = build_pass_environment(context, &pass_env);
4436 if (r < 0) {
4437 *exit_status = EXIT_MEMORY;
4438 return log_oom();
4439 }
4440
4441 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4442 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4443 * not specify PATH but the unit has ExecSearchPath. */
4444 if (!strv_isempty(context->exec_search_path)) {
4445 _cleanup_free_ char *joined = NULL;
4446
4447 joined = strv_join(context->exec_search_path, ":");
4448 if (!joined) {
4449 *exit_status = EXIT_MEMORY;
4450 return log_oom();
4451 }
4452
4453 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4454 if (r < 0) {
4455 *exit_status = EXIT_MEMORY;
4456 return log_oom();
4457 }
4458 }
4459
4460 accum_env = strv_env_merge(params->environment,
4461 our_env,
4462 joined_exec_search_path,
4463 pass_env,
4464 context->environment,
4465 params->files_env);
4466 if (!accum_env) {
4467 *exit_status = EXIT_MEMORY;
4468 return log_oom();
4469 }
4470 accum_env = strv_env_clean(accum_env);
4471
4472 (void) umask(context->umask);
4473
4474 r = setup_keyring(context, params, uid, gid);
4475 if (r < 0) {
4476 *exit_status = EXIT_KEYRING;
4477 return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m");
4478 }
4479
4480 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4481 * from it. */
4482 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4483
4484 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4485 * for it, and the kernel doesn't actually support ambient caps. */
4486 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4487
4488 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4489 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4490 * desired. */
4491 if (needs_ambient_hack)
4492 needs_setuid = false;
4493 else
4494 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4495
4496 uint64_t capability_ambient_set = context->capability_ambient_set;
4497
4498 if (needs_sandboxing) {
4499 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4500 * /sys being present. The actual MAC context application will happen later, as late as
4501 * possible, to avoid impacting our own code paths. */
4502
4503 #if HAVE_SELINUX
4504 use_selinux = mac_selinux_use();
4505 #endif
4506 #if ENABLE_SMACK
4507 use_smack = mac_smack_use();
4508 #endif
4509 #if HAVE_APPARMOR
4510 use_apparmor = mac_apparmor_use();
4511 #endif
4512 }
4513
4514 if (needs_sandboxing) {
4515 int which_failed;
4516
4517 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4518 * is set here. (See below.) */
4519
4520 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4521 if (r < 0) {
4522 *exit_status = EXIT_LIMITS;
4523 return log_exec_error_errno(context, params, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4524 }
4525 }
4526
4527 if (needs_setuid && context->pam_name && username) {
4528 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4529 * wins here. (See above.) */
4530
4531 /* All fds passed in the fds array will be closed in the pam child process. */
4532 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, params->fds, n_fds);
4533 if (r < 0) {
4534 *exit_status = EXIT_PAM;
4535 return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
4536 }
4537
4538 if (ambient_capabilities_supported()) {
4539 uint64_t ambient_after_pam;
4540
4541 /* PAM modules might have set some ambient caps. Query them here and merge them into
4542 * the caps we want to set in the end, so that we don't end up unsetting them. */
4543 r = capability_get_ambient(&ambient_after_pam);
4544 if (r < 0) {
4545 *exit_status = EXIT_CAPABILITIES;
4546 return log_exec_error_errno(context, params, r, "Failed to query ambient caps: %m");
4547 }
4548
4549 capability_ambient_set |= ambient_after_pam;
4550 }
4551
4552 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4553 if (ngids_after_pam < 0) {
4554 *exit_status = EXIT_GROUP;
4555 return log_exec_error_errno(context, params, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4556 }
4557 }
4558
4559 if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
4560 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4561 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4562 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4563
4564 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4565 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4566 * the actual requested operations fail (or silently continue). */
4567 if (r < 0 && context->private_users) {
4568 *exit_status = EXIT_USER;
4569 return log_exec_error_errno(context, params, r, "Failed to set up user namespacing for unprivileged user: %m");
4570 }
4571 if (r < 0)
4572 log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4573 else
4574 userns_set_up = true;
4575 }
4576
4577 if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4578
4579 /* Try to enable network namespacing if network namespacing is available and we have
4580 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4581 * new network namespace. And if we don't have that, then we could only create a network
4582 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4583 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
4584 r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
4585 if (ERRNO_IS_NEG_PRIVILEGE(r))
4586 log_exec_notice_errno(context, params, r,
4587 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4588 else if (r < 0) {
4589 *exit_status = EXIT_NETWORK;
4590 return log_exec_error_errno(context, params, r, "Failed to set up network namespacing: %m");
4591 }
4592 } else if (context->network_namespace_path) {
4593 *exit_status = EXIT_NETWORK;
4594 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4595 "NetworkNamespacePath= is not supported, refusing.");
4596 } else
4597 log_exec_notice(context, params, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4598 }
4599
4600 if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4601
4602 if (ns_type_supported(NAMESPACE_IPC)) {
4603 r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4604 if (r == -EPERM)
4605 log_exec_warning_errno(context, params, r,
4606 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4607 else if (r < 0) {
4608 *exit_status = EXIT_NAMESPACE;
4609 return log_exec_error_errno(context, params, r, "Failed to set up IPC namespacing: %m");
4610 }
4611 } else if (context->ipc_namespace_path) {
4612 *exit_status = EXIT_NAMESPACE;
4613 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4614 "IPCNamespacePath= is not supported, refusing.");
4615 } else
4616 log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4617 }
4618
4619 if (needs_mount_namespace) {
4620 _cleanup_free_ char *error_path = NULL;
4621
4622 r = apply_mount_namespace(command->flags, context, params, runtime, memory_pressure_path, &error_path);
4623 if (r < 0) {
4624 *exit_status = EXIT_NAMESPACE;
4625 return log_exec_error_errno(context, params, r, "Failed to set up mount namespacing%s%s: %m",
4626 error_path ? ": " : "", strempty(error_path));
4627 }
4628 }
4629
4630 if (needs_sandboxing) {
4631 r = apply_protect_hostname(context, params, exit_status);
4632 if (r < 0)
4633 return r;
4634 }
4635
4636 if (context->memory_ksm >= 0)
4637 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
4638 if (ERRNO_IS_NOT_SUPPORTED(errno))
4639 log_exec_debug_errno(context,
4640 params,
4641 errno,
4642 "KSM support not available, ignoring.");
4643 else {
4644 *exit_status = EXIT_KSM;
4645 return log_exec_error_errno(context, params, errno, "Failed to set KSM: %m");
4646 }
4647 }
4648
4649 /* Drop groups as early as possible.
4650 * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root.
4651 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4652 if (needs_setuid) {
4653 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4654 int ngids_to_enforce = 0;
4655
4656 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4657 ngids,
4658 gids_after_pam,
4659 ngids_after_pam,
4660 &gids_to_enforce);
4661 if (ngids_to_enforce < 0) {
4662 *exit_status = EXIT_GROUP;
4663 return log_exec_error_errno(context, params,
4664 ngids_to_enforce,
4665 "Failed to merge group lists. Group membership might be incorrect: %m");
4666 }
4667
4668 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4669 if (r < 0) {
4670 *exit_status = EXIT_GROUP;
4671 return log_exec_error_errno(context, params, r, "Changing group credentials failed: %m");
4672 }
4673 }
4674
4675 /* If the user namespace was not set up above, try to do it now.
4676 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4677 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4678 * case of mount namespaces being less privileged when the mount point list is copied from a
4679 * different user namespace). */
4680
4681 if (needs_sandboxing && context->private_users && !userns_set_up) {
4682 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4683 if (r < 0) {
4684 *exit_status = EXIT_USER;
4685 return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m");
4686 }
4687 }
4688
4689 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4690 * shall execute. */
4691
4692 _cleanup_free_ char *executable = NULL;
4693 _cleanup_close_ int executable_fd = -EBADF;
4694 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4695 if (r < 0) {
4696 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4697 log_exec_struct_errno(context, params, LOG_INFO, r,
4698 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4699 LOG_EXEC_INVOCATION_ID(params),
4700 LOG_EXEC_MESSAGE(params,
4701 "Executable %s missing, skipping: %m",
4702 command->path),
4703 "EXECUTABLE=%s", command->path);
4704 *exit_status = EXIT_SUCCESS;
4705 return 0;
4706 }
4707
4708 *exit_status = EXIT_EXEC;
4709 return log_exec_struct_errno(context, params, LOG_INFO, r,
4710 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4711 LOG_EXEC_INVOCATION_ID(params),
4712 LOG_EXEC_MESSAGE(params,
4713 "Failed to locate executable %s: %m",
4714 command->path),
4715 "EXECUTABLE=%s", command->path);
4716 }
4717
4718 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &executable_fd);
4719 if (r < 0) {
4720 *exit_status = EXIT_FDS;
4721 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4722 }
4723
4724 #if HAVE_SELINUX
4725 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4726 int fd = -EBADF;
4727
4728 if (socket_fd >= 0)
4729 fd = socket_fd;
4730 else if (params->n_socket_fds == 1)
4731 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4732 * use context from that fd to compute the label. */
4733 fd = params->fds[0];
4734
4735 if (fd >= 0) {
4736 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4737 if (r < 0) {
4738 if (!context->selinux_context_ignore) {
4739 *exit_status = EXIT_SELINUX_CONTEXT;
4740 return log_exec_error_errno(context,
4741 params,
4742 r,
4743 "Failed to determine SELinux context: %m");
4744 }
4745 log_exec_debug_errno(context,
4746 params,
4747 r,
4748 "Failed to determine SELinux context, ignoring: %m");
4749 }
4750 }
4751 }
4752 #endif
4753
4754 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4755 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4756 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
4757 * execve(). But first, close the remaining sockets in the context objects. */
4758
4759 exec_runtime_close(runtime);
4760 exec_params_close(params);
4761
4762 r = close_all_fds(keep_fds, n_keep_fds);
4763 if (r >= 0)
4764 r = shift_fds(params->fds, n_fds);
4765 if (r >= 0)
4766 r = flag_fds(params->fds, n_socket_fds, n_fds, context->non_blocking);
4767 if (r < 0) {
4768 *exit_status = EXIT_FDS;
4769 return log_exec_error_errno(context, params, r, "Failed to adjust passed file descriptors: %m");
4770 }
4771
4772 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4773 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4774 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4775 * came this far. */
4776
4777 secure_bits = context->secure_bits;
4778
4779 if (needs_sandboxing) {
4780 uint64_t bset;
4781
4782 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4783 * (Note this is placed after the general resource limit initialization, see above, in order
4784 * to take precedence.) */
4785 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4786 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4787 *exit_status = EXIT_LIMITS;
4788 return log_exec_error_errno(context, params, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4789 }
4790 }
4791
4792 #if ENABLE_SMACK
4793 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4794 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4795 if (use_smack && context->smack_process_label) {
4796 r = setup_smack(params, context, executable_fd);
4797 if (r < 0 && !context->smack_process_label_ignore) {
4798 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4799 return log_exec_error_errno(context, params, r, "Failed to set SMACK process label: %m");
4800 }
4801 }
4802 #endif
4803
4804 bset = context->capability_bounding_set;
4805 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4806 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4807 * instead of us doing that */
4808 if (needs_ambient_hack)
4809 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4810 (UINT64_C(1) << CAP_SETUID) |
4811 (UINT64_C(1) << CAP_SETGID);
4812
4813 #if HAVE_SECCOMP
4814 /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
4815 * keep the needed privileges to apply it even if we're not root. */
4816 if (needs_setuid &&
4817 uid_is_valid(uid) &&
4818 context_has_seccomp(context) &&
4819 seccomp_allows_drop_privileges(context)) {
4820 keep_seccomp_privileges = true;
4821
4822 if (prctl(PR_SET_KEEPCAPS, 1) < 0) {
4823 *exit_status = EXIT_USER;
4824 return log_exec_error_errno(context, params, errno, "Failed to enable keep capabilities flag: %m");
4825 }
4826
4827 /* Save the current bounding set so we can restore it after applying the seccomp
4828 * filter */
4829 saved_bset = bset;
4830 bset |= (UINT64_C(1) << CAP_SYS_ADMIN) |
4831 (UINT64_C(1) << CAP_SETPCAP);
4832 }
4833 #endif
4834
4835 if (!cap_test_all(bset)) {
4836 r = capability_bounding_set_drop(bset, /* right_now= */ false);
4837 if (r < 0) {
4838 *exit_status = EXIT_CAPABILITIES;
4839 return log_exec_error_errno(context, params, r, "Failed to drop capabilities: %m");
4840 }
4841 }
4842
4843 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4844 * keep-caps set.
4845 *
4846 * To be able to raise the ambient capabilities after setresuid() they have to be added to
4847 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
4848 * the ambient capabilities can be raised as they are present in the permitted and
4849 * inheritable set. However it is possible that someone wants to set ambient capabilities
4850 * without changing the user, so we also set the ambient capabilities here.
4851 *
4852 * The requested ambient capabilities are raised in the inheritable set if the second
4853 * argument is true. */
4854 if (!needs_ambient_hack) {
4855 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
4856 if (r < 0) {
4857 *exit_status = EXIT_CAPABILITIES;
4858 return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (before UID change): %m");
4859 }
4860 }
4861 }
4862
4863 /* chroot to root directory first, before we lose the ability to chroot */
4864 r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
4865 if (r < 0)
4866 return log_exec_error_errno(context, params, r, "Chrooting to the requested root directory failed: %m");
4867
4868 if (needs_setuid) {
4869 if (uid_is_valid(uid)) {
4870 r = enforce_user(context, uid, capability_ambient_set);
4871 if (r < 0) {
4872 *exit_status = EXIT_USER;
4873 return log_exec_error_errno(context, params, r, "Failed to change UID to " UID_FMT ": %m", uid);
4874 }
4875
4876 if (keep_seccomp_privileges) {
4877 if (!FLAGS_SET(capability_ambient_set, (UINT64_C(1) << CAP_SETUID))) {
4878 r = drop_capability(CAP_SETUID);
4879 if (r < 0) {
4880 *exit_status = EXIT_USER;
4881 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETUID: %m");
4882 }
4883 }
4884
4885 r = keep_capability(CAP_SYS_ADMIN);
4886 if (r < 0) {
4887 *exit_status = EXIT_USER;
4888 return log_exec_error_errno(context, params, r, "Failed to keep CAP_SYS_ADMIN: %m");
4889 }
4890
4891 r = keep_capability(CAP_SETPCAP);
4892 if (r < 0) {
4893 *exit_status = EXIT_USER;
4894 return log_exec_error_errno(context, params, r, "Failed to keep CAP_SETPCAP: %m");
4895 }
4896 }
4897
4898 if (!needs_ambient_hack && capability_ambient_set != 0) {
4899
4900 /* Raise the ambient capabilities after user change. */
4901 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
4902 if (r < 0) {
4903 *exit_status = EXIT_CAPABILITIES;
4904 return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (after UID change): %m");
4905 }
4906 }
4907 }
4908 }
4909
4910 /* Apply working directory here, because the working directory might be on NFS and only the user running
4911 * this service might have the correct privilege to change to the working directory */
4912 r = apply_working_directory(context, params, runtime, home, exit_status);
4913 if (r < 0)
4914 return log_exec_error_errno(context, params, r, "Changing to the requested working directory failed: %m");
4915
4916 if (needs_sandboxing) {
4917 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4918 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4919 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4920 * are restricted. */
4921
4922 #if HAVE_SELINUX
4923 if (use_selinux) {
4924 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4925
4926 if (exec_context) {
4927 r = setexeccon(exec_context);
4928 if (r < 0) {
4929 if (!context->selinux_context_ignore) {
4930 *exit_status = EXIT_SELINUX_CONTEXT;
4931 return log_exec_error_errno(context, params, r, "Failed to change SELinux context to %s: %m", exec_context);
4932 }
4933 log_exec_debug_errno(context,
4934 params,
4935 r,
4936 "Failed to change SELinux context to %s, ignoring: %m",
4937 exec_context);
4938 }
4939 }
4940 }
4941 #endif
4942
4943 #if HAVE_APPARMOR
4944 if (use_apparmor && context->apparmor_profile) {
4945 r = aa_change_onexec(context->apparmor_profile);
4946 if (r < 0 && !context->apparmor_profile_ignore) {
4947 *exit_status = EXIT_APPARMOR_PROFILE;
4948 return log_exec_error_errno(context,
4949 params,
4950 errno,
4951 "Failed to prepare AppArmor profile change to %s: %m",
4952 context->apparmor_profile);
4953 }
4954 }
4955 #endif
4956
4957 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
4958 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
4959 * requires CAP_SETPCAP. */
4960 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4961 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4962 * effective set here.
4963 *
4964 * The effective set is overwritten during execve() with the following values:
4965 *
4966 * - ambient set (for non-root processes)
4967 *
4968 * - (inheritable | bounding) set (for root processes)
4969 *
4970 * Hence there is no security impact to raise it in the effective set before execve
4971 */
4972 r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
4973 if (r < 0) {
4974 *exit_status = EXIT_CAPABILITIES;
4975 return log_exec_error_errno(context, params, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4976 }
4977 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4978 *exit_status = EXIT_SECUREBITS;
4979 return log_exec_error_errno(context, params, errno, "Failed to set process secure bits: %m");
4980 }
4981 }
4982
4983 if (context_has_no_new_privileges(context))
4984 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4985 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4986 return log_exec_error_errno(context, params, errno, "Failed to disable new privileges: %m");
4987 }
4988
4989 #if HAVE_SECCOMP
4990 r = apply_address_families(context, params);
4991 if (r < 0) {
4992 *exit_status = EXIT_ADDRESS_FAMILIES;
4993 return log_exec_error_errno(context, params, r, "Failed to restrict address families: %m");
4994 }
4995
4996 r = apply_memory_deny_write_execute(context, params);
4997 if (r < 0) {
4998 *exit_status = EXIT_SECCOMP;
4999 return log_exec_error_errno(context, params, r, "Failed to disable writing to executable memory: %m");
5000 }
5001
5002 r = apply_restrict_realtime(context, params);
5003 if (r < 0) {
5004 *exit_status = EXIT_SECCOMP;
5005 return log_exec_error_errno(context, params, r, "Failed to apply realtime restrictions: %m");
5006 }
5007
5008 r = apply_restrict_suid_sgid(context, params);
5009 if (r < 0) {
5010 *exit_status = EXIT_SECCOMP;
5011 return log_exec_error_errno(context, params, r, "Failed to apply SUID/SGID restrictions: %m");
5012 }
5013
5014 r = apply_restrict_namespaces(context, params);
5015 if (r < 0) {
5016 *exit_status = EXIT_SECCOMP;
5017 return log_exec_error_errno(context, params, r, "Failed to apply namespace restrictions: %m");
5018 }
5019
5020 r = apply_protect_sysctl(context, params);
5021 if (r < 0) {
5022 *exit_status = EXIT_SECCOMP;
5023 return log_exec_error_errno(context, params, r, "Failed to apply sysctl restrictions: %m");
5024 }
5025
5026 r = apply_protect_kernel_modules(context, params);
5027 if (r < 0) {
5028 *exit_status = EXIT_SECCOMP;
5029 return log_exec_error_errno(context, params, r, "Failed to apply module loading restrictions: %m");
5030 }
5031
5032 r = apply_protect_kernel_logs(context, params);
5033 if (r < 0) {
5034 *exit_status = EXIT_SECCOMP;
5035 return log_exec_error_errno(context, params, r, "Failed to apply kernel log restrictions: %m");
5036 }
5037
5038 r = apply_protect_clock(context, params);
5039 if (r < 0) {
5040 *exit_status = EXIT_SECCOMP;
5041 return log_exec_error_errno(context, params, r, "Failed to apply clock restrictions: %m");
5042 }
5043
5044 r = apply_private_devices(context, params);
5045 if (r < 0) {
5046 *exit_status = EXIT_SECCOMP;
5047 return log_exec_error_errno(context, params, r, "Failed to set up private devices: %m");
5048 }
5049
5050 r = apply_syscall_archs(context, params);
5051 if (r < 0) {
5052 *exit_status = EXIT_SECCOMP;
5053 return log_exec_error_errno(context, params, r, "Failed to apply syscall architecture restrictions: %m");
5054 }
5055
5056 r = apply_lock_personality(context, params);
5057 if (r < 0) {
5058 *exit_status = EXIT_SECCOMP;
5059 return log_exec_error_errno(context, params, r, "Failed to lock personalities: %m");
5060 }
5061
5062 r = apply_syscall_log(context, params);
5063 if (r < 0) {
5064 *exit_status = EXIT_SECCOMP;
5065 return log_exec_error_errno(context, params, r, "Failed to apply system call log filters: %m");
5066 }
5067 #endif
5068
5069 #if HAVE_LIBBPF
5070 r = apply_restrict_filesystems(context, params);
5071 if (r < 0) {
5072 *exit_status = EXIT_BPF;
5073 return log_exec_error_errno(context, params, r, "Failed to restrict filesystems: %m");
5074 }
5075 #endif
5076
5077 #if HAVE_SECCOMP
5078 /* This really should remain as close to the execve() as possible, to make sure our own code is affected
5079 * by the filter as little as possible. */
5080 r = apply_syscall_filter(context, params, needs_ambient_hack);
5081 if (r < 0) {
5082 *exit_status = EXIT_SECCOMP;
5083 return log_exec_error_errno(context, params, r, "Failed to apply system call filters: %m");
5084 }
5085
5086 if (keep_seccomp_privileges) {
5087 /* Restore the capability bounding set with what's expected from the service + the
5088 * ambient capabilities hack */
5089 if (!cap_test_all(saved_bset)) {
5090 r = capability_bounding_set_drop(saved_bset, /* right_now= */ false);
5091 if (r < 0) {
5092 *exit_status = EXIT_CAPABILITIES;
5093 return log_exec_error_errno(context, params, r, "Failed to drop bset capabilities: %m");
5094 }
5095 }
5096
5097 /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break
5098 * applications that use it. */
5099 if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SYS_ADMIN))) {
5100 r = drop_capability(CAP_SYS_ADMIN);
5101 if (r < 0) {
5102 *exit_status = EXIT_USER;
5103 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SYS_ADMIN: %m");
5104 }
5105 }
5106
5107 /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break
5108 * applications that use it. */
5109 if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SETPCAP))) {
5110 r = drop_capability(CAP_SETPCAP);
5111 if (r < 0) {
5112 *exit_status = EXIT_USER;
5113 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETPCAP: %m");
5114 }
5115 }
5116
5117 if (prctl(PR_SET_KEEPCAPS, 0) < 0) {
5118 *exit_status = EXIT_USER;
5119 return log_exec_error_errno(context, params, errno, "Failed to drop keep capabilities flag: %m");
5120 }
5121 }
5122 #endif
5123
5124 }
5125
5126 if (!strv_isempty(context->unset_environment)) {
5127 char **ee = NULL;
5128
5129 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5130 if (!ee) {
5131 *exit_status = EXIT_MEMORY;
5132 return log_oom();
5133 }
5134
5135 strv_free_and_replace(accum_env, ee);
5136 }
5137
5138 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5139 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5140
5141 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5142 if (r < 0) {
5143 *exit_status = EXIT_MEMORY;
5144 return log_exec_error_errno(context,
5145 params,
5146 r,
5147 "Failed to replace environment variables: %m");
5148 }
5149 final_argv = replaced_argv;
5150
5151 if (!strv_isempty(unset_variables)) {
5152 _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5153 log_exec_warning(context,
5154 params,
5155 "Referenced but unset environment variable evaluates to an empty string: %s",
5156 strna(ju));
5157 }
5158
5159 if (!strv_isempty(bad_variables)) {
5160 _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5161 log_exec_warning(context,
5162 params,
5163 "Invalid environment variable name evaluates to an empty string: %s",
5164 strna(jb));
5165 }
5166 } else
5167 final_argv = command->argv;
5168
5169 log_command_line(context, params, "Executing", executable, final_argv);
5170
5171 if (params->exec_fd >= 0) {
5172 uint8_t hot = 1;
5173
5174 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5175 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5176
5177 if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
5178 *exit_status = EXIT_EXEC;
5179 return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m");
5180 }
5181 }
5182
5183 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5184
5185 if (params->exec_fd >= 0) {
5186 uint8_t hot = 0;
5187
5188 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5189 * that POLLHUP on it no longer means execve() succeeded. */
5190
5191 if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
5192 *exit_status = EXIT_EXEC;
5193 return log_exec_error_errno(context, params, errno, "Failed to disable exec_fd: %m");
5194 }
5195 }
5196
5197 *exit_status = EXIT_EXEC;
5198 return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable);
5199 }