]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/exec-invoke.c
Merge pull request #31464 from poettering/vmspawn-limit-bank
[thirdparty/systemd.git] / src / core / exec-invoke.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <sys/eventfd.h>
4 #include <sys/ioctl.h>
5 #include <sys/mount.h>
6 #include <sys/prctl.h>
7
8 #if HAVE_PAM
9 #include <security/pam_appl.h>
10 #include <security/pam_misc.h>
11 #endif
12
13 #if HAVE_APPARMOR
14 #include <sys/apparmor.h>
15 #endif
16
17 #include "sd-messages.h"
18
19 #if HAVE_APPARMOR
20 #include "apparmor-util.h"
21 #endif
22 #include "argv-util.h"
23 #include "barrier.h"
24 #include "bpf-dlopen.h"
25 #include "bpf-restrict-fs.h"
26 #include "btrfs-util.h"
27 #include "capability-util.h"
28 #include "cgroup-setup.h"
29 #include "chase.h"
30 #include "chattr-util.h"
31 #include "chown-recursive.h"
32 #include "copy.h"
33 #include "data-fd-util.h"
34 #include "env-util.h"
35 #include "escape.h"
36 #include "exec-credential.h"
37 #include "exec-invoke.h"
38 #include "execute.h"
39 #include "exit-status.h"
40 #include "fd-util.h"
41 #include "hexdecoct.h"
42 #include "io-util.h"
43 #include "iovec-util.h"
44 #include "missing_ioprio.h"
45 #include "missing_prctl.h"
46 #include "missing_securebits.h"
47 #include "missing_syscall.h"
48 #include "mkdir-label.h"
49 #include "proc-cmdline.h"
50 #include "process-util.h"
51 #include "psi-util.h"
52 #include "rlimit-util.h"
53 #include "seccomp-util.h"
54 #include "selinux-util.h"
55 #include "signal-util.h"
56 #include "smack-util.h"
57 #include "socket-util.h"
58 #include "string-table.h"
59 #include "strv.h"
60 #include "terminal-util.h"
61 #include "utmp-wtmp.h"
62 #include "vpick.h"
63
64 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
65 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
66
67 #define SNDBUF_SIZE (8*1024*1024)
68
/* Adjusts the file descriptor flags of the fds that are about to be handed to the child.
 *
 * fds          – array of n_fds file descriptors (may be NULL only if n_fds is 0)
 * n_socket_fds – number of leading entries in fds[] that get their O_NONBLOCK flag adjusted
 * n_fds        – total number of entries in fds[]
 * nonblock     – value passed to fd_nonblock() for the first n_socket_fds entries
 *
 * Returns 0 on success, or the first negative value returned by fd_nonblock()/fd_cloexec(). */
static int flag_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        assert(fds || n_fds == 0);

        for (size_t idx = 0; idx < n_fds; idx++) {
                int q;

                /* O_NONBLOCK is only touched for the socket activation fds, which come first in the array. */
                if (idx < n_socket_fds) {
                        q = fd_nonblock(fds[idx], nonblock);
                        if (q < 0)
                                return q;
                }

                /* FD_CLOEXEC is cleared on every fd without exception: the whole point of this list is
                 * that the fds are passed on to our children. */
                q = fd_cloexec(fds[idx], false);
                if (q < 0)
                        return q;
        }

        return 0;
}
101
102 static bool is_terminal_input(ExecInput i) {
103 return IN_SET(i,
104 EXEC_INPUT_TTY,
105 EXEC_INPUT_TTY_FORCE,
106 EXEC_INPUT_TTY_FAIL);
107 }
108
109 static bool is_terminal_output(ExecOutput o) {
110 return IN_SET(o,
111 EXEC_OUTPUT_TTY,
112 EXEC_OUTPUT_KMSG_AND_CONSOLE,
113 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
114 }
115
116 static bool is_kmsg_output(ExecOutput o) {
117 return IN_SET(o,
118 EXEC_OUTPUT_KMSG,
119 EXEC_OUTPUT_KMSG_AND_CONSOLE);
120 }
121
122 static bool exec_context_needs_term(const ExecContext *c) {
123 assert(c);
124
125 /* Return true if the execution context suggests we should set $TERM to something useful. */
126
127 if (is_terminal_input(c->std_input))
128 return true;
129
130 if (is_terminal_output(c->std_output))
131 return true;
132
133 if (is_terminal_output(c->std_error))
134 return true;
135
136 return !!c->tty_path;
137 }
138
139 static int open_null_as(int flags, int nfd) {
140 int fd;
141
142 assert(nfd >= 0);
143
144 fd = open("/dev/null", flags|O_NOCTTY);
145 if (fd < 0)
146 return -errno;
147
148 return move_fd(fd, nfd, false);
149 }
150
151 static int connect_journal_socket(
152 int fd,
153 const char *log_namespace,
154 uid_t uid,
155 gid_t gid) {
156
157 uid_t olduid = UID_INVALID;
158 gid_t oldgid = GID_INVALID;
159 const char *j;
160 int r;
161
162 j = log_namespace ?
163 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
164 "/run/systemd/journal/stdout";
165
166 if (gid_is_valid(gid)) {
167 oldgid = getgid();
168
169 if (setegid(gid) < 0)
170 return -errno;
171 }
172
173 if (uid_is_valid(uid)) {
174 olduid = getuid();
175
176 if (seteuid(uid) < 0) {
177 r = -errno;
178 goto restore_gid;
179 }
180 }
181
182 r = connect_unix_path(fd, AT_FDCWD, j);
183
184 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
185 an LSM interferes. */
186
187 if (uid_is_valid(uid))
188 (void) seteuid(olduid);
189
190 restore_gid:
191 if (gid_is_valid(gid))
192 (void) setegid(oldgid);
193
194 return r;
195 }
196
197 static int connect_logger_as(
198 const ExecContext *context,
199 const ExecParameters *params,
200 ExecOutput output,
201 const char *ident,
202 int nfd,
203 uid_t uid,
204 gid_t gid) {
205
206 _cleanup_close_ int fd = -EBADF;
207 int r;
208
209 assert(context);
210 assert(params);
211 assert(output < _EXEC_OUTPUT_MAX);
212 assert(ident);
213 assert(nfd >= 0);
214
215 fd = socket(AF_UNIX, SOCK_STREAM, 0);
216 if (fd < 0)
217 return -errno;
218
219 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
220 if (r < 0)
221 return r;
222
223 if (shutdown(fd, SHUT_RD) < 0)
224 return -errno;
225
226 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
227
228 if (dprintf(fd,
229 "%s\n"
230 "%s\n"
231 "%i\n"
232 "%i\n"
233 "%i\n"
234 "%i\n"
235 "%i\n",
236 context->syslog_identifier ?: ident,
237 params->flags & EXEC_PASS_LOG_UNIT ? params->unit_id : "",
238 context->syslog_priority,
239 !!context->syslog_level_prefix,
240 false,
241 is_kmsg_output(output),
242 is_terminal_output(output)) < 0)
243 return -errno;
244
245 return move_fd(TAKE_FD(fd), nfd, false);
246 }
247
248 static int open_terminal_as(const char *path, int flags, int nfd) {
249 int fd;
250
251 assert(path);
252 assert(nfd >= 0);
253
254 fd = open_terminal(path, flags | O_NOCTTY);
255 if (fd < 0)
256 return fd;
257
258 return move_fd(fd, nfd, false);
259 }
260
261 static int acquire_path(const char *path, int flags, mode_t mode) {
262 _cleanup_close_ int fd = -EBADF;
263 int r;
264
265 assert(path);
266
267 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
268 flags |= O_CREAT;
269
270 fd = open(path, flags|O_NOCTTY, mode);
271 if (fd >= 0)
272 return TAKE_FD(fd);
273
274 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
275 return -errno;
276
277 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
278
279 fd = socket(AF_UNIX, SOCK_STREAM, 0);
280 if (fd < 0)
281 return -errno;
282
283 r = connect_unix_path(fd, AT_FDCWD, path);
284 if (IN_SET(r, -ENOTSOCK, -EINVAL))
285 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
286 * wasn't an AF_UNIX socket after all */
287 return -ENXIO;
288 if (r < 0)
289 return r;
290
291 if ((flags & O_ACCMODE) == O_RDONLY)
292 r = shutdown(fd, SHUT_WR);
293 else if ((flags & O_ACCMODE) == O_WRONLY)
294 r = shutdown(fd, SHUT_RD);
295 else
296 r = 0;
297 if (r < 0)
298 return -errno;
299
300 return TAKE_FD(fd);
301 }
302
303 static int fixup_input(
304 const ExecContext *context,
305 int socket_fd,
306 bool apply_tty_stdin) {
307
308 ExecInput std_input;
309
310 assert(context);
311
312 std_input = context->std_input;
313
314 if (is_terminal_input(std_input) && !apply_tty_stdin)
315 return EXEC_INPUT_NULL;
316
317 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
318 return EXEC_INPUT_NULL;
319
320 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
321 return EXEC_INPUT_NULL;
322
323 return std_input;
324 }
325
326 static int fixup_output(ExecOutput output, int socket_fd) {
327
328 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
329 return EXEC_OUTPUT_INHERIT;
330
331 return output;
332 }
333
334 static int setup_input(
335 const ExecContext *context,
336 const ExecParameters *params,
337 int socket_fd,
338 const int named_iofds[static 3]) {
339
340 ExecInput i;
341 int r;
342
343 assert(context);
344 assert(params);
345 assert(named_iofds);
346
347 if (params->stdin_fd >= 0) {
348 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
349 return -errno;
350
351 /* Try to make this the controlling tty, if it is a tty, and reset it */
352 if (isatty(STDIN_FILENO)) {
353 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
354
355 if (context->tty_reset)
356 (void) reset_terminal_fd(STDIN_FILENO, /* switch_to_text= */ true);
357
358 (void) exec_context_apply_tty_size(context, STDIN_FILENO, /* tty_path= */ NULL);
359 }
360
361 return STDIN_FILENO;
362 }
363
364 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
365
366 switch (i) {
367
368 case EXEC_INPUT_NULL:
369 return open_null_as(O_RDONLY, STDIN_FILENO);
370
371 case EXEC_INPUT_TTY:
372 case EXEC_INPUT_TTY_FORCE:
373 case EXEC_INPUT_TTY_FAIL: {
374 _cleanup_close_ int tty_fd = -EBADF;
375 const char *tty_path;
376
377 tty_path = ASSERT_PTR(exec_context_tty_path(context));
378
379 tty_fd = acquire_terminal(tty_path,
380 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
381 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
382 ACQUIRE_TERMINAL_WAIT,
383 USEC_INFINITY);
384 if (tty_fd < 0)
385 return tty_fd;
386
387 r = exec_context_apply_tty_size(context, tty_fd, tty_path);
388 if (r < 0)
389 return r;
390
391 r = move_fd(tty_fd, STDIN_FILENO, /* cloexec= */ false);
392 if (r < 0)
393 return r;
394
395 TAKE_FD(tty_fd);
396 return r;
397 }
398
399 case EXEC_INPUT_SOCKET:
400 assert(socket_fd >= 0);
401
402 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
403
404 case EXEC_INPUT_NAMED_FD:
405 assert(named_iofds[STDIN_FILENO] >= 0);
406
407 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
408 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
409
410 case EXEC_INPUT_DATA: {
411 int fd;
412
413 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
414 if (fd < 0)
415 return fd;
416
417 return move_fd(fd, STDIN_FILENO, false);
418 }
419
420 case EXEC_INPUT_FILE: {
421 bool rw;
422 int fd;
423
424 assert(context->stdio_file[STDIN_FILENO]);
425
426 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
427 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
428
429 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
430 if (fd < 0)
431 return fd;
432
433 return move_fd(fd, STDIN_FILENO, false);
434 }
435
436 default:
437 assert_not_reached();
438 }
439 }
440
441 static bool can_inherit_stderr_from_stdout(
442 const ExecContext *context,
443 ExecOutput o,
444 ExecOutput e) {
445
446 assert(context);
447
448 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
449 * stderr fd */
450
451 if (e == EXEC_OUTPUT_INHERIT)
452 return true;
453 if (e != o)
454 return false;
455
456 if (e == EXEC_OUTPUT_NAMED_FD)
457 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
458
459 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
460 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
461
462 return true;
463 }
464
465 static int setup_output(
466 const ExecContext *context,
467 const ExecParameters *params,
468 int fileno,
469 int socket_fd,
470 const int named_iofds[static 3],
471 const char *ident,
472 uid_t uid,
473 gid_t gid,
474 dev_t *journal_stream_dev,
475 ino_t *journal_stream_ino) {
476
477 ExecOutput o;
478 ExecInput i;
479 int r;
480
481 assert(context);
482 assert(params);
483 assert(ident);
484 assert(journal_stream_dev);
485 assert(journal_stream_ino);
486
487 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
488
489 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
490 return -errno;
491
492 return STDOUT_FILENO;
493 }
494
495 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
496 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
497 return -errno;
498
499 return STDERR_FILENO;
500 }
501
502 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
503 o = fixup_output(context->std_output, socket_fd);
504
505 if (fileno == STDERR_FILENO) {
506 ExecOutput e;
507 e = fixup_output(context->std_error, socket_fd);
508
509 /* This expects the input and output are already set up */
510
511 /* Don't change the stderr file descriptor if we inherit all
512 * the way and are not on a tty */
513 if (e == EXEC_OUTPUT_INHERIT &&
514 o == EXEC_OUTPUT_INHERIT &&
515 i == EXEC_INPUT_NULL &&
516 !is_terminal_input(context->std_input) &&
517 getppid() != 1)
518 return fileno;
519
520 /* Duplicate from stdout if possible */
521 if (can_inherit_stderr_from_stdout(context, o, e))
522 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
523
524 o = e;
525
526 } else if (o == EXEC_OUTPUT_INHERIT) {
527 /* If input got downgraded, inherit the original value */
528 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
529 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
530
531 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
532 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
533 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
534
535 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
536 if (getppid() != 1)
537 return fileno;
538
539 /* We need to open /dev/null here anew, to get the right access mode. */
540 return open_null_as(O_WRONLY, fileno);
541 }
542
543 switch (o) {
544
545 case EXEC_OUTPUT_NULL:
546 return open_null_as(O_WRONLY, fileno);
547
548 case EXEC_OUTPUT_TTY:
549 if (is_terminal_input(i))
550 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
551
552 /* We don't reset the terminal if this is just about output */
553 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
554
555 case EXEC_OUTPUT_KMSG:
556 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
557 case EXEC_OUTPUT_JOURNAL:
558 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
559 r = connect_logger_as(context, params, o, ident, fileno, uid, gid);
560 if (r < 0) {
561 log_exec_warning_errno(context,
562 params,
563 r,
564 "Failed to connect %s to the journal socket, ignoring: %m",
565 fileno == STDOUT_FILENO ? "stdout" : "stderr");
566 r = open_null_as(O_WRONLY, fileno);
567 } else {
568 struct stat st;
569
570 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
571 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
572 * services to detect whether they are connected to the journal or not.
573 *
574 * If both stdout and stderr are connected to a stream then let's make sure to store the data
575 * about STDERR as that's usually the best way to do logging. */
576
577 if (fstat(fileno, &st) >= 0 &&
578 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
579 *journal_stream_dev = st.st_dev;
580 *journal_stream_ino = st.st_ino;
581 }
582 }
583 return r;
584
585 case EXEC_OUTPUT_SOCKET:
586 assert(socket_fd >= 0);
587
588 return RET_NERRNO(dup2(socket_fd, fileno));
589
590 case EXEC_OUTPUT_NAMED_FD:
591 assert(named_iofds[fileno] >= 0);
592
593 (void) fd_nonblock(named_iofds[fileno], false);
594 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
595
596 case EXEC_OUTPUT_FILE:
597 case EXEC_OUTPUT_FILE_APPEND:
598 case EXEC_OUTPUT_FILE_TRUNCATE: {
599 bool rw;
600 int fd, flags;
601
602 assert(context->stdio_file[fileno]);
603
604 rw = context->std_input == EXEC_INPUT_FILE &&
605 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
606
607 if (rw)
608 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
609
610 flags = O_WRONLY;
611 if (o == EXEC_OUTPUT_FILE_APPEND)
612 flags |= O_APPEND;
613 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
614 flags |= O_TRUNC;
615
616 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
617 if (fd < 0)
618 return fd;
619
620 return move_fd(fd, fileno, 0);
621 }
622
623 default:
624 assert_not_reached();
625 }
626 }
627
628 static int chown_terminal(int fd, uid_t uid) {
629 int r;
630
631 assert(fd >= 0);
632
633 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
634 if (!isatty_safe(fd))
635 return 0;
636
637 /* This might fail. What matters are the results. */
638 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
639 if (r < 0)
640 return r;
641
642 return 1;
643 }
644
645 static int setup_confirm_stdio(
646 const ExecContext *context,
647 const char *vc,
648 int *ret_saved_stdin,
649 int *ret_saved_stdout) {
650
651 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
652 int r;
653
654 assert(ret_saved_stdin);
655 assert(ret_saved_stdout);
656
657 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
658 if (saved_stdin < 0)
659 return -errno;
660
661 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
662 if (saved_stdout < 0)
663 return -errno;
664
665 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
666 if (fd < 0)
667 return fd;
668
669 r = chown_terminal(fd, getuid());
670 if (r < 0)
671 return r;
672
673 r = reset_terminal_fd(fd, /* switch_to_text= */ true);
674 if (r < 0)
675 return r;
676
677 r = exec_context_apply_tty_size(context, fd, vc);
678 if (r < 0)
679 return r;
680
681 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
682 TAKE_FD(fd);
683 if (r < 0)
684 return r;
685
686 *ret_saved_stdin = TAKE_FD(saved_stdin);
687 *ret_saved_stdout = TAKE_FD(saved_stdout);
688 return 0;
689 }
690
/* Writes a message to 'fd' explaining that asking for confirmation for unit 'unit_id' failed with the
 * (negative) error 'err', and that a positive response is assumed. A timeout gets a dedicated
 * message. */
static void write_confirm_error_fd(int err, int fd, const char *unit_id) {
        assert(err < 0);
        assert(unit_id);

        if (err != -ETIMEDOUT) {
                /* %m formats errno, hence load the error into it first. */
                errno = -err;
                dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", unit_id);
                return;
        }

        dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", unit_id);
}
702
703 static void write_confirm_error(int err, const char *vc, const char *unit_id) {
704 _cleanup_close_ int fd = -EBADF;
705
706 assert(vc);
707
708 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
709 if (fd < 0)
710 return;
711
712 write_confirm_error_fd(err, fd, unit_id);
713 }
714
/* Counterpart of setup_confirm_stdio(): releases the terminal and puts the saved stdin/stdout copies
 * back in place. The saved fds are closed and the variables they were stored in are updated with the
 * result of safe_close(), regardless of whether restoring worked.
 *
 * Returns 0 on success, or a negative errno if a dup2() failed (if both fail, the error of the stdout
 * one is returned). */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
736
/* Possible outcomes of ask_for_confirmation(). */
enum {
        CONFIRM_PRETEND_SUCCESS = 0,  /* don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         = 1,  /* go ahead and execute the command */
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command, pretend it failed */
};
742
/* Returns true if the flag file /run/systemd/confirm_spawn_disabled exists. */
static bool confirm_spawn_disabled(void) {
        int q = access("/run/systemd/confirm_spawn_disabled", F_OK);

        return q >= 0;
}
746
747 static int ask_for_confirmation(const ExecContext *context, const ExecParameters *params, const char *cmdline) {
748 int saved_stdout = -1, saved_stdin = -1, r;
749 _cleanup_free_ char *e = NULL;
750 char c;
751
752 assert(context);
753 assert(params);
754
755 /* For any internal errors, assume a positive response. */
756 r = setup_confirm_stdio(context, params->confirm_spawn, &saved_stdin, &saved_stdout);
757 if (r < 0) {
758 write_confirm_error(r, params->confirm_spawn, params->unit_id);
759 return CONFIRM_EXECUTE;
760 }
761
762 /* confirm_spawn might have been disabled while we were sleeping. */
763 if (!params->confirm_spawn || confirm_spawn_disabled()) {
764 r = 1;
765 goto restore_stdio;
766 }
767
768 e = ellipsize(cmdline, 60, 100);
769 if (!e) {
770 log_oom();
771 r = CONFIRM_EXECUTE;
772 goto restore_stdio;
773 }
774
775 for (;;) {
776 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
777 if (r < 0) {
778 write_confirm_error_fd(r, STDOUT_FILENO, params->unit_id);
779 r = CONFIRM_EXECUTE;
780 goto restore_stdio;
781 }
782
783 switch (c) {
784 case 'c':
785 printf("Resuming normal execution.\n");
786 manager_disable_confirm_spawn();
787 r = 1;
788 break;
789 case 'D':
790 printf(" Unit: %s\n",
791 params->unit_id);
792 exec_context_dump(context, stdout, " ");
793 exec_params_dump(params, stdout, " ");
794 continue; /* ask again */
795 case 'f':
796 printf("Failing execution.\n");
797 r = CONFIRM_PRETEND_FAILURE;
798 break;
799 case 'h':
800 printf(" c - continue, proceed without asking anymore\n"
801 " D - dump, show the state of the unit\n"
802 " f - fail, don't execute the command and pretend it failed\n"
803 " h - help\n"
804 " i - info, show a short summary of the unit\n"
805 " j - jobs, show jobs that are in progress\n"
806 " s - skip, don't execute the command and pretend it succeeded\n"
807 " y - yes, execute the command\n");
808 continue; /* ask again */
809 case 'i':
810 printf(" Unit: %s\n"
811 " Command: %s\n",
812 params->unit_id, cmdline);
813 continue; /* ask again */
814 case 'j':
815 if (sigqueue(getppid(),
816 SIGRTMIN+18,
817 (const union sigval) { .sival_int = MANAGER_SIGNAL_COMMAND_DUMP_JOBS }) < 0)
818 return -errno;
819
820 continue; /* ask again */
821 case 'n':
822 /* 'n' was removed in favor of 'f'. */
823 printf("Didn't understand 'n', did you mean 'f'?\n");
824 continue; /* ask again */
825 case 's':
826 printf("Skipping execution.\n");
827 r = CONFIRM_PRETEND_SUCCESS;
828 break;
829 case 'y':
830 r = CONFIRM_EXECUTE;
831 break;
832 default:
833 assert_not_reached();
834 }
835 break;
836 }
837
838 restore_stdio:
839 restore_confirm_stdio(&saved_stdin, &saved_stdout);
840 return r;
841 }
842
843 static int get_fixed_user(
844 const char *user_or_uid,
845 const char **ret_username,
846 uid_t *ret_uid,
847 gid_t *ret_gid,
848 const char **ret_home,
849 const char **ret_shell) {
850
851 int r;
852
853 assert(user_or_uid);
854 assert(ret_username);
855
856 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
857 * (i.e. are "/" or "/bin/nologin"). */
858
859 r = get_user_creds(&user_or_uid, ret_uid, ret_gid, ret_home, ret_shell, USER_CREDS_CLEAN);
860 if (r < 0)
861 return r;
862
863 /* user_or_uid is normalized by get_user_creds to username */
864 *ret_username = user_or_uid;
865
866 return 0;
867 }
868
/* Resolves the group specified as name or numeric GID in 'group_or_gid' via get_group_creds().
 *
 * On success 0 is returned and *ret_groupname is set to the group name as normalized by
 * get_group_creds(); ret_gid is passed on to get_group_creds() unmodified. On failure the negative
 * error of get_group_creds() is returned and *ret_groupname is left untouched. */
static int get_fixed_group(
                const char *group_or_gid,
                const char **ret_groupname,
                gid_t *ret_gid) {

        const char *name;
        int q;

        assert(group_or_gid);
        assert(ret_groupname);

        name = group_or_gid;
        q = get_group_creds(&name, ret_gid, /* flags = */ 0);
        if (q < 0)
                return q;

        /* get_group_creds() updated 'name' to point to the normalized group name. */
        *ret_groupname = name;
        return 0;
}
888
889 static int get_supplementary_groups(const ExecContext *c, const char *user,
890 const char *group, gid_t gid,
891 gid_t **supplementary_gids, int *ngids) {
892 int r, k = 0;
893 int ngroups_max;
894 bool keep_groups = false;
895 gid_t *groups = NULL;
896 _cleanup_free_ gid_t *l_gids = NULL;
897
898 assert(c);
899
900 /*
901 * If user is given, then lookup GID and supplementary groups list.
902 * We avoid NSS lookups for gid=0. Also we have to initialize groups
903 * here and as early as possible so we keep the list of supplementary
904 * groups of the caller.
905 */
906 if (user && gid_is_valid(gid) && gid != 0) {
907 /* First step, initialize groups from /etc/groups */
908 if (initgroups(user, gid) < 0)
909 return -errno;
910
911 keep_groups = true;
912 }
913
914 if (strv_isempty(c->supplementary_groups))
915 return 0;
916
917 /*
918 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
919 * be positive, otherwise fail.
920 */
921 errno = 0;
922 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
923 if (ngroups_max <= 0)
924 return errno_or_else(EOPNOTSUPP);
925
926 l_gids = new(gid_t, ngroups_max);
927 if (!l_gids)
928 return -ENOMEM;
929
930 if (keep_groups) {
931 /*
932 * Lookup the list of groups that the user belongs to, we
933 * avoid NSS lookups here too for gid=0.
934 */
935 k = ngroups_max;
936 if (getgrouplist(user, gid, l_gids, &k) < 0)
937 return -EINVAL;
938 } else
939 k = 0;
940
941 STRV_FOREACH(i, c->supplementary_groups) {
942 const char *g;
943
944 if (k >= ngroups_max)
945 return -E2BIG;
946
947 g = *i;
948 r = get_group_creds(&g, l_gids+k, 0);
949 if (r < 0)
950 return r;
951
952 k++;
953 }
954
955 /*
956 * Sets ngids to zero to drop all supplementary groups, happens
957 * when we are under root and SupplementaryGroups= is empty.
958 */
959 if (k == 0) {
960 *ngids = 0;
961 return 0;
962 }
963
964 /* Otherwise get the final list of supplementary groups */
965 groups = memdup(l_gids, sizeof(gid_t) * k);
966 if (!groups)
967 return -ENOMEM;
968
969 *supplementary_gids = groups;
970 *ngids = k;
971
972 groups = NULL;
973
974 return 0;
975 }
976
/* Applies the group credentials to the current process: first the supplementary groups (only if
 * 'ngids' is positive), then the real, effective and saved GID (only if 'gid' is valid).
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
995
/* Updates the securebits of the calling process: all bits in 'mask' are cleared, then all bits in
 * 'bits' are set.
 *
 * Returns 0 if the securebits already had the requested value (nothing is changed then), 1 if they
 * were changed, and a negative errno if prctl() failed. */
static int set_securebits(unsigned bits, unsigned mask) {
        unsigned wanted;
        int previous;

        previous = prctl(PR_GET_SECUREBITS);
        if (previous < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        wanted = ((unsigned) previous & ~mask) | bits;

        /* Avoid the second syscall if there is nothing to change. */
        if (wanted == (unsigned) previous)
                return 0;

        if (prctl(PR_SET_SECUREBITS, wanted) < 0)
                return -errno;

        return 1;
}
1014
1015 static int enforce_user(
1016 const ExecContext *context,
1017 uid_t uid,
1018 uint64_t capability_ambient_set) {
1019 assert(context);
1020 int r;
1021
1022 if (!uid_is_valid(uid))
1023 return 0;
1024
1025 /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1026 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1027 * case. */
1028
1029 if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1030
1031 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1032 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1033 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1034 if (r < 0)
1035 return r;
1036 }
1037
1038 /* Second step: actually set the uids */
1039 if (setresuid(uid, uid, uid) < 0)
1040 return -errno;
1041
1042 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1043 * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1044 * outside of this call. */
1045 return 0;
1046 }
1047
1048 #if HAVE_PAM
1049
1050 static int null_conv(
1051 int num_msg,
1052 const struct pam_message **msg,
1053 struct pam_response **resp,
1054 void *appdata_ptr) {
1055
1056 /* We don't support conversations */
1057
1058 return PAM_CONV_ERR;
1059 }
1060
1061 static int pam_close_session_and_delete_credentials(pam_handle_t *handle, int flags) {
1062 int r, s;
1063
1064 assert(handle);
1065
1066 r = pam_close_session(handle, flags);
1067 if (r != PAM_SUCCESS)
1068 log_debug("pam_close_session() failed: %s", pam_strerror(handle, r));
1069
1070 s = pam_setcred(handle, PAM_DELETE_CRED | flags);
1071 if (s != PAM_SUCCESS)
1072 log_debug("pam_setcred(PAM_DELETE_CRED) failed: %s", pam_strerror(handle, s));
1073
1074 return r != PAM_SUCCESS ? r : s;
1075 }
1076
1077 #endif
1078
1079 static int setup_pam(
1080 const char *name,
1081 const char *user,
1082 uid_t uid,
1083 gid_t gid,
1084 const char *tty,
1085 char ***env, /* updated on success */
1086 const int fds[], size_t n_fds,
1087 int exec_fd) {
1088
1089 #if HAVE_PAM
1090
1091 static const struct pam_conv conv = {
1092 .conv = null_conv,
1093 .appdata_ptr = NULL
1094 };
1095
1096 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1097 _cleanup_strv_free_ char **e = NULL;
1098 pam_handle_t *handle = NULL;
1099 sigset_t old_ss;
1100 int pam_code = PAM_SUCCESS, r;
1101 bool close_session = false;
1102 pid_t parent_pid;
1103 int flags = 0;
1104
1105 assert(name);
1106 assert(user);
1107 assert(env);
1108
1109 /* We set up PAM in the parent process, then fork. The child
1110 * will then stay around until killed via PR_GET_PDEATHSIG or
1111 * systemd via the cgroup logic. It will then remove the PAM
1112 * session again. The parent process will exec() the actual
1113 * daemon. We do things this way to ensure that the main PID
1114 * of the daemon is the one we initially fork()ed. */
1115
1116 r = barrier_create(&barrier);
1117 if (r < 0)
1118 goto fail;
1119
1120 if (log_get_max_level() < LOG_DEBUG)
1121 flags |= PAM_SILENT;
1122
1123 pam_code = pam_start(name, user, &conv, &handle);
1124 if (pam_code != PAM_SUCCESS) {
1125 handle = NULL;
1126 goto fail;
1127 }
1128
1129 if (!tty) {
1130 _cleanup_free_ char *q = NULL;
1131
1132 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1133 * out if that's the case, and read the TTY off it. */
1134
1135 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1136 tty = strjoina("/dev/", q);
1137 }
1138
1139 if (tty) {
1140 pam_code = pam_set_item(handle, PAM_TTY, tty);
1141 if (pam_code != PAM_SUCCESS)
1142 goto fail;
1143 }
1144
1145 STRV_FOREACH(nv, *env) {
1146 pam_code = pam_putenv(handle, *nv);
1147 if (pam_code != PAM_SUCCESS)
1148 goto fail;
1149 }
1150
1151 pam_code = pam_acct_mgmt(handle, flags);
1152 if (pam_code != PAM_SUCCESS)
1153 goto fail;
1154
1155 pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
1156 if (pam_code != PAM_SUCCESS)
1157 log_debug("pam_setcred(PAM_ESTABLISH_CRED) failed, ignoring: %s", pam_strerror(handle, pam_code));
1158
1159 pam_code = pam_open_session(handle, flags);
1160 if (pam_code != PAM_SUCCESS)
1161 goto fail;
1162
1163 close_session = true;
1164
1165 e = pam_getenvlist(handle);
1166 if (!e) {
1167 pam_code = PAM_BUF_ERR;
1168 goto fail;
1169 }
1170
1171 /* Block SIGTERM, so that we know that it won't get lost in the child */
1172
1173 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM) >= 0);
1174
1175 parent_pid = getpid_cached();
1176
1177 r = safe_fork("(sd-pam)", 0, NULL);
1178 if (r < 0)
1179 goto fail;
1180 if (r == 0) {
1181 int ret = EXIT_PAM;
1182
1183 /* The child's job is to reset the PAM session on termination */
1184 barrier_set_role(&barrier, BARRIER_CHILD);
1185
1186 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1187 * those fds are open here that have been opened by PAM. */
1188 (void) close_many(fds, n_fds);
1189
1190 /* Also close the 'exec_fd' in the child, since the service manager waits for the EOF induced
1191 * by the execve() to wait for completion, and if we'd keep the fd open here in the child
1192 * we'd never signal completion. */
1193 exec_fd = safe_close(exec_fd);
1194
1195 /* Drop privileges - we don't need any to pam_close_session and this will make
1196 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
1197 * threads to fail to exit normally */
1198
1199 r = fully_set_uid_gid(uid, gid, /* supplementary_gids= */ NULL, /* n_supplementary_gids= */ 0);
1200 if (r < 0)
1201 log_warning_errno(r, "Failed to drop privileges in sd-pam: %m");
1202
1203 (void) ignore_signals(SIGPIPE);
1204
1205 /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1206 * otherwise the kernel will not allow unprivileged parents kill their privileged children
1207 * this way. We rely on the control groups kill logic to do the rest for us. */
1208 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1209 goto child_finish;
1210
1211 /* Tell the parent that our setup is done. This is especially important regarding dropping
1212 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1213 *
1214 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1215 (void) barrier_place(&barrier);
1216
1217 /* Check if our parent process might already have died? */
1218 if (getppid() == parent_pid) {
1219 sigset_t ss;
1220 int sig;
1221
1222 assert_se(sigemptyset(&ss) >= 0);
1223 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1224
1225 assert_se(sigwait(&ss, &sig) == 0);
1226 assert(sig == SIGTERM);
1227 }
1228
1229 /* If our parent died we'll end the session */
1230 if (getppid() != parent_pid) {
1231 pam_code = pam_close_session_and_delete_credentials(handle, flags);
1232 if (pam_code != PAM_SUCCESS)
1233 goto child_finish;
1234 }
1235
1236 ret = 0;
1237
1238 child_finish:
1239 /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1240 * know about this. See pam_end(3) */
1241 (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
1242 _exit(ret);
1243 }
1244
1245 barrier_set_role(&barrier, BARRIER_PARENT);
1246
1247 /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1248 * here. */
1249 handle = NULL;
1250
1251 /* Unblock SIGTERM again in the parent */
1252 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1253
1254 /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1255 * this fd around. */
1256 closelog();
1257
1258 /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1259 * recover. However, warn loudly if it happens. */
1260 if (!barrier_place_and_sync(&barrier))
1261 log_error("PAM initialization failed");
1262
1263 return strv_free_and_replace(*env, e);
1264
1265 fail:
1266 if (pam_code != PAM_SUCCESS) {
1267 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1268 r = -EPERM; /* PAM errors do not map to errno */
1269 } else
1270 log_error_errno(r, "PAM failed: %m");
1271
1272 if (handle) {
1273 if (close_session)
1274 pam_code = pam_close_session_and_delete_credentials(handle, flags);
1275
1276 (void) pam_end(handle, pam_code | flags);
1277 }
1278
1279 closelog();
1280 return r;
1281 #else
1282 return 0;
1283 #endif
1284 }
1285
1286 static void rename_process_from_path(const char *path) {
1287 _cleanup_free_ char *buf = NULL;
1288 const char *p;
1289
1290 assert(path);
1291
1292 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1293 * /bin/ps */
1294
1295 if (path_extract_filename(path, &buf) < 0) {
1296 rename_process("(...)");
1297 return;
1298 }
1299
1300 size_t l = strlen(buf);
1301 if (l > 8) {
1302 /* The end of the process name is usually more interesting, since the first bit might just be
1303 * "systemd-" */
1304 p = buf + l - 8;
1305 l = 8;
1306 } else
1307 p = buf;
1308
1309 char process_name[11];
1310 process_name[0] = '(';
1311 memcpy(process_name+1, p, l);
1312 process_name[1+l] = ')';
1313 process_name[1+l+1] = 0;
1314
1315 (void) rename_process(process_name);
1316 }
1317
1318 static bool context_has_address_families(const ExecContext *c) {
1319 assert(c);
1320
1321 return c->address_families_allow_list ||
1322 !set_isempty(c->address_families);
1323 }
1324
1325 static bool context_has_syscall_filters(const ExecContext *c) {
1326 assert(c);
1327
1328 return c->syscall_allow_list ||
1329 !hashmap_isempty(c->syscall_filter);
1330 }
1331
1332 static bool context_has_syscall_logs(const ExecContext *c) {
1333 assert(c);
1334
1335 return c->syscall_log_allow_list ||
1336 !hashmap_isempty(c->syscall_log);
1337 }
1338
1339 static bool context_has_seccomp(const ExecContext *c) {
1340 /* We need NNP if we have any form of seccomp and are unprivileged */
1341 return c->lock_personality ||
1342 c->memory_deny_write_execute ||
1343 c->private_devices ||
1344 c->protect_clock ||
1345 c->protect_hostname ||
1346 c->protect_kernel_tunables ||
1347 c->protect_kernel_modules ||
1348 c->protect_kernel_logs ||
1349 context_has_address_families(c) ||
1350 exec_context_restrict_namespaces_set(c) ||
1351 c->restrict_realtime ||
1352 c->restrict_suid_sgid ||
1353 !set_isempty(c->syscall_archs) ||
1354 context_has_syscall_filters(c) ||
1355 context_has_syscall_logs(c);
1356 }
1357
1358 static bool context_has_no_new_privileges(const ExecContext *c) {
1359 assert(c);
1360
1361 if (c->no_new_privileges)
1362 return true;
1363
1364 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1365 return false;
1366
1367 return context_has_seccomp(c);
1368 }
1369
1370 #if HAVE_SECCOMP
1371
1372 static bool seccomp_allows_drop_privileges(const ExecContext *c) {
1373 void *id, *val;
1374 bool has_capget = false, has_capset = false, has_prctl = false;
1375
1376 assert(c);
1377
1378 /* No syscall filter, we are allowed to drop privileges */
1379 if (hashmap_isempty(c->syscall_filter))
1380 return true;
1381
1382 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
1383 _cleanup_free_ char *name = NULL;
1384
1385 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
1386
1387 if (streq(name, "capget"))
1388 has_capget = true;
1389 else if (streq(name, "capset"))
1390 has_capset = true;
1391 else if (streq(name, "prctl"))
1392 has_prctl = true;
1393 }
1394
1395 if (c->syscall_allow_list)
1396 return has_capget && has_capset && has_prctl;
1397 else
1398 return !(has_capget || has_capset || has_prctl);
1399 }
1400
1401 static bool skip_seccomp_unavailable(const ExecContext *c, const ExecParameters *p, const char* msg) {
1402
1403 if (is_seccomp_available())
1404 return false;
1405
1406 log_exec_debug(c, p, "SECCOMP features not detected in the kernel, skipping %s", msg);
1407 return true;
1408 }
1409
1410 static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p, bool needs_ambient_hack) {
1411 uint32_t negative_action, default_action, action;
1412 int r;
1413
1414 assert(c);
1415 assert(p);
1416
1417 if (!context_has_syscall_filters(c))
1418 return 0;
1419
1420 if (skip_seccomp_unavailable(c, p, "SystemCallFilter="))
1421 return 0;
1422
1423 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1424
1425 if (c->syscall_allow_list) {
1426 default_action = negative_action;
1427 action = SCMP_ACT_ALLOW;
1428 } else {
1429 default_action = SCMP_ACT_ALLOW;
1430 action = negative_action;
1431 }
1432
1433 if (needs_ambient_hack) {
1434 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1435 if (r < 0)
1436 return r;
1437 }
1438
1439 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1440 }
1441
1442 static int apply_syscall_log(const ExecContext *c, const ExecParameters *p) {
1443 #ifdef SCMP_ACT_LOG
1444 uint32_t default_action, action;
1445 #endif
1446
1447 assert(c);
1448 assert(p);
1449
1450 if (!context_has_syscall_logs(c))
1451 return 0;
1452
1453 #ifdef SCMP_ACT_LOG
1454 if (skip_seccomp_unavailable(c, p, "SystemCallLog="))
1455 return 0;
1456
1457 if (c->syscall_log_allow_list) {
1458 /* Log nothing but the ones listed */
1459 default_action = SCMP_ACT_ALLOW;
1460 action = SCMP_ACT_LOG;
1461 } else {
1462 /* Log everything but the ones listed */
1463 default_action = SCMP_ACT_LOG;
1464 action = SCMP_ACT_ALLOW;
1465 }
1466
1467 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1468 #else
1469 /* old libseccomp */
1470 log_exec_debug(c, p, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1471 return 0;
1472 #endif
1473 }
1474
1475 static int apply_syscall_archs(const ExecContext *c, const ExecParameters *p) {
1476 assert(c);
1477 assert(p);
1478
1479 if (set_isempty(c->syscall_archs))
1480 return 0;
1481
1482 if (skip_seccomp_unavailable(c, p, "SystemCallArchitectures="))
1483 return 0;
1484
1485 return seccomp_restrict_archs(c->syscall_archs);
1486 }
1487
1488 static int apply_address_families(const ExecContext *c, const ExecParameters *p) {
1489 assert(c);
1490 assert(p);
1491
1492 if (!context_has_address_families(c))
1493 return 0;
1494
1495 if (skip_seccomp_unavailable(c, p, "RestrictAddressFamilies="))
1496 return 0;
1497
1498 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1499 }
1500
1501 static int apply_memory_deny_write_execute(const ExecContext *c, const ExecParameters *p) {
1502 int r;
1503
1504 assert(c);
1505 assert(p);
1506
1507 if (!c->memory_deny_write_execute)
1508 return 0;
1509
1510 /* use prctl() if kernel supports it (6.3) */
1511 r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1512 if (r == 0) {
1513 log_exec_debug(c, p, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1514 return 0;
1515 }
1516 if (r < 0 && errno != EINVAL)
1517 return log_exec_debug_errno(c,
1518 p,
1519 errno,
1520 "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1521 /* else use seccomp */
1522 log_exec_debug(c, p, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1523
1524 if (skip_seccomp_unavailable(c, p, "MemoryDenyWriteExecute="))
1525 return 0;
1526
1527 return seccomp_memory_deny_write_execute();
1528 }
1529
1530 static int apply_restrict_realtime(const ExecContext *c, const ExecParameters *p) {
1531 assert(c);
1532 assert(p);
1533
1534 if (!c->restrict_realtime)
1535 return 0;
1536
1537 if (skip_seccomp_unavailable(c, p, "RestrictRealtime="))
1538 return 0;
1539
1540 return seccomp_restrict_realtime();
1541 }
1542
1543 static int apply_restrict_suid_sgid(const ExecContext *c, const ExecParameters *p) {
1544 assert(c);
1545 assert(p);
1546
1547 if (!c->restrict_suid_sgid)
1548 return 0;
1549
1550 if (skip_seccomp_unavailable(c, p, "RestrictSUIDSGID="))
1551 return 0;
1552
1553 return seccomp_restrict_suid_sgid();
1554 }
1555
1556 static int apply_protect_sysctl(const ExecContext *c, const ExecParameters *p) {
1557 assert(c);
1558 assert(p);
1559
1560 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1561 * let's protect even those systems where this is left on in the kernel. */
1562
1563 if (!c->protect_kernel_tunables)
1564 return 0;
1565
1566 if (skip_seccomp_unavailable(c, p, "ProtectKernelTunables="))
1567 return 0;
1568
1569 return seccomp_protect_sysctl();
1570 }
1571
1572 static int apply_protect_kernel_modules(const ExecContext *c, const ExecParameters *p) {
1573 assert(c);
1574 assert(p);
1575
1576 /* Turn off module syscalls on ProtectKernelModules=yes */
1577
1578 if (!c->protect_kernel_modules)
1579 return 0;
1580
1581 if (skip_seccomp_unavailable(c, p, "ProtectKernelModules="))
1582 return 0;
1583
1584 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1585 }
1586
1587 static int apply_protect_kernel_logs(const ExecContext *c, const ExecParameters *p) {
1588 assert(c);
1589 assert(p);
1590
1591 if (!c->protect_kernel_logs)
1592 return 0;
1593
1594 if (skip_seccomp_unavailable(c, p, "ProtectKernelLogs="))
1595 return 0;
1596
1597 return seccomp_protect_syslog();
1598 }
1599
1600 static int apply_protect_clock(const ExecContext *c, const ExecParameters *p) {
1601 assert(c);
1602 assert(p);
1603
1604 if (!c->protect_clock)
1605 return 0;
1606
1607 if (skip_seccomp_unavailable(c, p, "ProtectClock="))
1608 return 0;
1609
1610 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1611 }
1612
1613 static int apply_private_devices(const ExecContext *c, const ExecParameters *p) {
1614 assert(c);
1615 assert(p);
1616
1617 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1618
1619 if (!c->private_devices)
1620 return 0;
1621
1622 if (skip_seccomp_unavailable(c, p, "PrivateDevices="))
1623 return 0;
1624
1625 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1626 }
1627
1628 static int apply_restrict_namespaces(const ExecContext *c, const ExecParameters *p) {
1629 assert(c);
1630 assert(p);
1631
1632 if (!exec_context_restrict_namespaces_set(c))
1633 return 0;
1634
1635 if (skip_seccomp_unavailable(c, p, "RestrictNamespaces="))
1636 return 0;
1637
1638 return seccomp_restrict_namespaces(c->restrict_namespaces);
1639 }
1640
1641 static int apply_lock_personality(const ExecContext *c, const ExecParameters *p) {
1642 unsigned long personality;
1643 int r;
1644
1645 assert(c);
1646 assert(p);
1647
1648 if (!c->lock_personality)
1649 return 0;
1650
1651 if (skip_seccomp_unavailable(c, p, "LockPersonality="))
1652 return 0;
1653
1654 personality = c->personality;
1655
1656 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1657 if (personality == PERSONALITY_INVALID) {
1658
1659 r = opinionated_personality(&personality);
1660 if (r < 0)
1661 return r;
1662 }
1663
1664 return seccomp_lock_personality(personality);
1665 }
1666
1667 #endif
1668
1669 #if HAVE_LIBBPF
1670 static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) {
1671 int r;
1672
1673 assert(c);
1674 assert(p);
1675
1676 if (!exec_context_restrict_filesystems_set(c))
1677 return 0;
1678
1679 if (p->bpf_restrict_fs_map_fd < 0) {
1680 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1681 log_exec_debug(c, p, "LSM BPF not supported, skipping RestrictFileSystems=");
1682 return 0;
1683 }
1684
1685 /* We are in a new binary, so dl-open again */
1686 r = dlopen_bpf();
1687 if (r < 0)
1688 return r;
1689
1690 return bpf_restrict_fs_update(c->restrict_filesystems, p->cgroup_id, p->bpf_restrict_fs_map_fd, c->restrict_filesystems_allow_list);
1691 }
1692 #endif
1693
1694 static int apply_protect_hostname(const ExecContext *c, const ExecParameters *p, int *ret_exit_status) {
1695 assert(c);
1696 assert(p);
1697
1698 if (!c->protect_hostname)
1699 return 0;
1700
1701 if (ns_type_supported(NAMESPACE_UTS)) {
1702 if (unshare(CLONE_NEWUTS) < 0) {
1703 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1704 *ret_exit_status = EXIT_NAMESPACE;
1705 return log_exec_error_errno(c,
1706 p,
1707 errno,
1708 "Failed to set up UTS namespacing: %m");
1709 }
1710
1711 log_exec_warning(c,
1712 p,
1713 "ProtectHostname=yes is configured, but UTS namespace setup is "
1714 "prohibited (container manager?), ignoring namespace setup.");
1715 }
1716 } else
1717 log_exec_warning(c,
1718 p,
1719 "ProtectHostname=yes is configured, but the kernel does not "
1720 "support UTS namespaces, ignoring namespace setup.");
1721
1722 #if HAVE_SECCOMP
1723 int r;
1724
1725 if (skip_seccomp_unavailable(c, p, "ProtectHostname="))
1726 return 0;
1727
1728 r = seccomp_protect_hostname();
1729 if (r < 0) {
1730 *ret_exit_status = EXIT_SECCOMP;
1731 return log_exec_error_errno(c, p, r, "Failed to apply hostname restrictions: %m");
1732 }
1733 #endif
1734
1735 return 0;
1736 }
1737
1738 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1739 assert(idle_pipe);
1740
1741 idle_pipe[1] = safe_close(idle_pipe[1]);
1742 idle_pipe[2] = safe_close(idle_pipe[2]);
1743
1744 if (idle_pipe[0] >= 0) {
1745 int r;
1746
1747 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1748
1749 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1750 ssize_t n;
1751
1752 /* Signal systemd that we are bored and want to continue. */
1753 n = write(idle_pipe[3], "x", 1);
1754 if (n > 0)
1755 /* Wait for systemd to react to the signal above. */
1756 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1757 }
1758
1759 idle_pipe[0] = safe_close(idle_pipe[0]);
1760
1761 }
1762
1763 idle_pipe[3] = safe_close(idle_pipe[3]);
1764 }
1765
1766 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1767
1768 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
1769 * the service payload in. */
1770 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1771 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
1772 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
1773 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
1774 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
1775 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
1776 };
1777
1778 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
1779
1780 static int build_environment(
1781 const ExecContext *c,
1782 const ExecParameters *p,
1783 const CGroupContext *cgroup_context,
1784 size_t n_fds,
1785 const char *home,
1786 const char *username,
1787 const char *shell,
1788 dev_t journal_stream_dev,
1789 ino_t journal_stream_ino,
1790 const char *memory_pressure_path,
1791 char ***ret) {
1792
1793 _cleanup_strv_free_ char **our_env = NULL;
1794 size_t n_env = 0;
1795 char *x;
1796 int r;
1797
1798 assert(c);
1799 assert(p);
1800 assert(ret);
1801
1802 #define N_ENV_VARS 19
1803 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1804 if (!our_env)
1805 return -ENOMEM;
1806
1807 if (n_fds > 0) {
1808 _cleanup_free_ char *joined = NULL;
1809
1810 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1811 return -ENOMEM;
1812 our_env[n_env++] = x;
1813
1814 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1815 return -ENOMEM;
1816 our_env[n_env++] = x;
1817
1818 joined = strv_join(p->fd_names, ":");
1819 if (!joined)
1820 return -ENOMEM;
1821
1822 x = strjoin("LISTEN_FDNAMES=", joined);
1823 if (!x)
1824 return -ENOMEM;
1825 our_env[n_env++] = x;
1826 }
1827
1828 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1829 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1830 return -ENOMEM;
1831 our_env[n_env++] = x;
1832
1833 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1834 return -ENOMEM;
1835 our_env[n_env++] = x;
1836 }
1837
1838 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1839 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1840 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1841 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1842 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1843 if (!x)
1844 return -ENOMEM;
1845 our_env[n_env++] = x;
1846 }
1847
1848 /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
1849 * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
1850 * really make much sense since we're not logged in. Hence we conditionalize the three based on
1851 * SetLoginEnvironment= switch. */
1852 if (!c->user && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
1853 r = get_fixed_user("root", &username, NULL, NULL, &home, &shell);
1854 if (r < 0)
1855 return log_exec_debug_errno(c,
1856 p,
1857 r,
1858 "Failed to determine user credentials for root: %m");
1859 }
1860
1861 bool set_user_login_env = exec_context_get_set_login_environment(c);
1862
1863 if (username) {
1864 x = strjoin("USER=", username);
1865 if (!x)
1866 return -ENOMEM;
1867 our_env[n_env++] = x;
1868
1869 if (set_user_login_env) {
1870 x = strjoin("LOGNAME=", username);
1871 if (!x)
1872 return -ENOMEM;
1873 our_env[n_env++] = x;
1874 }
1875 }
1876
1877 if (home && set_user_login_env) {
1878 x = strjoin("HOME=", home);
1879 if (!x)
1880 return -ENOMEM;
1881
1882 path_simplify(x + 5);
1883 our_env[n_env++] = x;
1884 }
1885
1886 if (shell && set_user_login_env) {
1887 x = strjoin("SHELL=", shell);
1888 if (!x)
1889 return -ENOMEM;
1890
1891 path_simplify(x + 6);
1892 our_env[n_env++] = x;
1893 }
1894
1895 if (!sd_id128_is_null(p->invocation_id)) {
1896 assert(p->invocation_id_string);
1897
1898 x = strjoin("INVOCATION_ID=", p->invocation_id_string);
1899 if (!x)
1900 return -ENOMEM;
1901
1902 our_env[n_env++] = x;
1903 }
1904
1905 if (exec_context_needs_term(c)) {
1906 _cleanup_free_ char *cmdline = NULL;
1907 const char *tty_path, *term = NULL;
1908
1909 tty_path = exec_context_tty_path(c);
1910
1911 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1912 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1913 * container manager passes to PID 1 ends up all the way in the console login shown. */
1914
1915 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1916 term = getenv("TERM");
1917 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1918 _cleanup_free_ char *key = NULL;
1919
1920 key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
1921 if (!key)
1922 return -ENOMEM;
1923
1924 r = proc_cmdline_get_key(key, 0, &cmdline);
1925 if (r < 0)
1926 log_exec_debug_errno(c,
1927 p,
1928 r,
1929 "Failed to read %s from kernel cmdline, ignoring: %m",
1930 key);
1931 else if (r > 0)
1932 term = cmdline;
1933 }
1934
1935 if (!term)
1936 term = default_term_for_tty(tty_path);
1937
1938 x = strjoin("TERM=", term);
1939 if (!x)
1940 return -ENOMEM;
1941 our_env[n_env++] = x;
1942 }
1943
1944 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1945 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1946 return -ENOMEM;
1947
1948 our_env[n_env++] = x;
1949 }
1950
1951 if (c->log_namespace) {
1952 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1953 if (!x)
1954 return -ENOMEM;
1955
1956 our_env[n_env++] = x;
1957 }
1958
1959 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1960 _cleanup_free_ char *joined = NULL;
1961 const char *n;
1962
1963 if (!p->prefix[t])
1964 continue;
1965
1966 if (c->directories[t].n_items == 0)
1967 continue;
1968
1969 n = exec_directory_env_name_to_string(t);
1970 if (!n)
1971 continue;
1972
1973 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1974 _cleanup_free_ char *prefixed = NULL;
1975
1976 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1977 if (!prefixed)
1978 return -ENOMEM;
1979
1980 if (!strextend_with_separator(&joined, ":", prefixed))
1981 return -ENOMEM;
1982 }
1983
1984 x = strjoin(n, "=", joined);
1985 if (!x)
1986 return -ENOMEM;
1987
1988 our_env[n_env++] = x;
1989 }
1990
1991 _cleanup_free_ char *creds_dir = NULL;
1992 r = exec_context_get_credential_directory(c, p, p->unit_id, &creds_dir);
1993 if (r < 0)
1994 return r;
1995 if (r > 0) {
1996 x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
1997 if (!x)
1998 return -ENOMEM;
1999
2000 our_env[n_env++] = x;
2001 }
2002
2003 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2004 return -ENOMEM;
2005
2006 our_env[n_env++] = x;
2007
2008 if (memory_pressure_path) {
2009 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2010 if (!x)
2011 return -ENOMEM;
2012
2013 our_env[n_env++] = x;
2014
2015 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2016 _cleanup_free_ char *b = NULL, *e = NULL;
2017
2018 if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2019 MEMORY_PRESSURE_DEFAULT_TYPE,
2020 cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2021 CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2022 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2023 return -ENOMEM;
2024
2025 if (base64mem(b, strlen(b) + 1, &e) < 0)
2026 return -ENOMEM;
2027
2028 x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2029 if (!x)
2030 return -ENOMEM;
2031
2032 our_env[n_env++] = x;
2033 }
2034 }
2035
2036 assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2037 #undef N_ENV_VARS
2038
2039 *ret = TAKE_PTR(our_env);
2040
2041 return 0;
2042 }
2043
2044 static int build_pass_environment(const ExecContext *c, char ***ret) {
2045 _cleanup_strv_free_ char **pass_env = NULL;
2046 size_t n_env = 0;
2047
2048 STRV_FOREACH(i, c->pass_environment) {
2049 _cleanup_free_ char *x = NULL;
2050 char *v;
2051
2052 v = getenv(*i);
2053 if (!v)
2054 continue;
2055 x = strjoin(*i, "=", v);
2056 if (!x)
2057 return -ENOMEM;
2058
2059 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2060 return -ENOMEM;
2061
2062 pass_env[n_env++] = TAKE_PTR(x);
2063 pass_env[n_env] = NULL;
2064 }
2065
2066 *ret = TAKE_PTR(pass_env);
2067
2068 return 0;
2069 }
2070
2071 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2072 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2073 _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
2074 _cleanup_close_ int unshare_ready_fd = -EBADF;
2075 _cleanup_(sigkill_waitp) pid_t pid = 0;
2076 uint64_t c = 1;
2077 ssize_t n;
2078 int r;
2079
2080 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2081 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2082 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2083 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2084 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2085 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2086 * continues execution normally.
2087 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2088 * does not need CAP_SETUID to write the single line mapping to itself. */
2089
2090 /* Can only set up multiple mappings with CAP_SETUID. */
2091 if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2092 r = asprintf(&uid_map,
2093 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2094 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2095 ouid, ouid, uid, uid);
2096 else
2097 r = asprintf(&uid_map,
2098 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2099 ouid, ouid);
2100
2101 if (r < 0)
2102 return -ENOMEM;
2103
2104 /* Can only set up multiple mappings with CAP_SETGID. */
2105 if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2106 r = asprintf(&gid_map,
2107 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2108 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2109 ogid, ogid, gid, gid);
2110 else
2111 r = asprintf(&gid_map,
2112 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2113 ogid, ogid);
2114
2115 if (r < 0)
2116 return -ENOMEM;
2117
2118 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2119 * namespace. */
2120 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2121 if (unshare_ready_fd < 0)
2122 return -errno;
2123
2124 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2125 * failed. */
2126 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2127 return -errno;
2128
2129 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, &pid);
2130 if (r < 0)
2131 return r;
2132 if (r == 0) {
2133 _cleanup_close_ int fd = -EBADF;
2134 const char *a;
2135 pid_t ppid;
2136
2137 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2138 * here, after the parent opened its own user namespace. */
2139
2140 ppid = getppid();
2141 errno_pipe[0] = safe_close(errno_pipe[0]);
2142
2143 /* Wait until the parent unshared the user namespace */
2144 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2145 r = -errno;
2146 goto child_fail;
2147 }
2148
2149 /* Disable the setgroups() system call in the child user namespace, for good. */
2150 a = procfs_file_alloca(ppid, "setgroups");
2151 fd = open(a, O_WRONLY|O_CLOEXEC);
2152 if (fd < 0) {
2153 if (errno != ENOENT) {
2154 r = -errno;
2155 goto child_fail;
2156 }
2157
2158 /* If the file is missing the kernel is too old, let's continue anyway. */
2159 } else {
2160 if (write(fd, "deny\n", 5) < 0) {
2161 r = -errno;
2162 goto child_fail;
2163 }
2164
2165 fd = safe_close(fd);
2166 }
2167
2168 /* First write the GID map */
2169 a = procfs_file_alloca(ppid, "gid_map");
2170 fd = open(a, O_WRONLY|O_CLOEXEC);
2171 if (fd < 0) {
2172 r = -errno;
2173 goto child_fail;
2174 }
2175 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2176 r = -errno;
2177 goto child_fail;
2178 }
2179 fd = safe_close(fd);
2180
2181 /* The write the UID map */
2182 a = procfs_file_alloca(ppid, "uid_map");
2183 fd = open(a, O_WRONLY|O_CLOEXEC);
2184 if (fd < 0) {
2185 r = -errno;
2186 goto child_fail;
2187 }
2188 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2189 r = -errno;
2190 goto child_fail;
2191 }
2192
2193 _exit(EXIT_SUCCESS);
2194
2195 child_fail:
2196 (void) write(errno_pipe[1], &r, sizeof(r));
2197 _exit(EXIT_FAILURE);
2198 }
2199
2200 errno_pipe[1] = safe_close(errno_pipe[1]);
2201
2202 if (unshare(CLONE_NEWUSER) < 0)
2203 return -errno;
2204
2205 /* Let the child know that the namespace is ready now */
2206 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2207 return -errno;
2208
2209 /* Try to read an error code from the child */
2210 n = read(errno_pipe[0], &r, sizeof(r));
2211 if (n < 0)
2212 return -errno;
2213 if (n == sizeof(r)) { /* an error code was sent to us */
2214 if (r < 0)
2215 return r;
2216 return -EIO;
2217 }
2218 if (n != 0) /* on success we should have read 0 bytes */
2219 return -EIO;
2220
2221 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2222 if (r < 0)
2223 return r;
2224 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2225 return -EIO;
2226
2227 return 0;
2228 }
2229
2230 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2231 _cleanup_free_ char *src_abs = NULL;
2232 int r;
2233
2234 assert(source);
2235
2236 src_abs = path_join(root, source);
2237 if (!src_abs)
2238 return -ENOMEM;
2239
2240 STRV_FOREACH(dst, symlinks) {
2241 _cleanup_free_ char *dst_abs = NULL;
2242
2243 dst_abs = path_join(root, *dst);
2244 if (!dst_abs)
2245 return -ENOMEM;
2246
2247 r = mkdir_parents_label(dst_abs, 0755);
2248 if (r < 0)
2249 return r;
2250
2251 r = symlink_idempotent(src_abs, dst_abs, true);
2252 if (r < 0)
2253 return r;
2254 }
2255
2256 return 0;
2257 }
2258
2259 static int setup_exec_directory(
2260 const ExecContext *context,
2261 const ExecParameters *params,
2262 uid_t uid,
2263 gid_t gid,
2264 ExecDirectoryType type,
2265 bool needs_mount_namespace,
2266 int *exit_status) {
2267
2268 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2269 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2270 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2271 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2272 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2273 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2274 };
2275 int r;
2276
2277 assert(context);
2278 assert(params);
2279 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2280 assert(exit_status);
2281
2282 if (!params->prefix[type])
2283 return 0;
2284
2285 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2286 if (!uid_is_valid(uid))
2287 uid = 0;
2288 if (!gid_is_valid(gid))
2289 gid = 0;
2290 }
2291
2292 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2293 _cleanup_free_ char *p = NULL, *pp = NULL;
2294
2295 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2296 if (!p) {
2297 r = -ENOMEM;
2298 goto fail;
2299 }
2300
2301 r = mkdir_parents_label(p, 0755);
2302 if (r < 0)
2303 goto fail;
2304
2305 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2306
2307 /* If we are in user mode, and a configuration directory exists but a state directory
2308 * doesn't exist, then we likely are upgrading from an older systemd version that
2309 * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2310 * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2311 * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now
2312 * separated. If a service has both dirs configured but only the configuration dir
2313 * exists and the state dir does not, we assume we are looking at an update
2314 * situation. Hence, create a compatibility symlink, so that all expectations are
2315 * met.
2316 *
2317 * (We also do something similar with the log directory, which still doesn't exist in
2318 * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2319
2320 /* this assumes the state dir is always created before the configuration dir */
2321 assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2322 assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2323
2324 r = laccess(p, F_OK);
2325 if (r == -ENOENT) {
2326 _cleanup_free_ char *q = NULL;
2327
2328 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2329 * under the configuration hierarchy. */
2330
2331 if (type == EXEC_DIRECTORY_STATE)
2332 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2333 else if (type == EXEC_DIRECTORY_LOGS)
2334 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2335 else
2336 assert_not_reached();
2337 if (!q) {
2338 r = -ENOMEM;
2339 goto fail;
2340 }
2341
2342 r = laccess(q, F_OK);
2343 if (r >= 0) {
2344 /* It does exist! This hence looks like an update. Symlink the
2345 * configuration directory into the state directory. */
2346
2347 r = symlink_idempotent(q, p, /* make_relative= */ true);
2348 if (r < 0)
2349 goto fail;
2350
2351 log_exec_notice(context, params, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2352 continue;
2353 } else if (r != -ENOENT)
2354 log_exec_warning_errno(context, params, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2355
2356 } else if (r < 0)
2357 log_exec_warning_errno(context, params, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2358 }
2359
2360 if (exec_directory_is_private(context, type)) {
2361 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2362 * case we want to avoid leaving a directory around fully accessible that is owned by
2363 * a dynamic user whose UID is later on reused. To lock this down we use the same
2364 * trick used by container managers to prohibit host users to get access to files of
2365 * the same UID in containers: we place everything inside a directory that has an
2366 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2367 * for unprivileged host code. We then use fs namespacing to make this directory
2368 * permeable for the service itself.
2369 *
2370 * Specifically: for a service which wants a special directory "foo/" we first create
2371 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2372 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2373 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2374 * unprivileged host users can't look into it. Inside of the namespace of the unit
2375 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2376 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2377 * for the service and making sure it only gets access to the dirs it needs but no
2378 * others. Tricky? Yes, absolutely, but it works!
2379 *
2380 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2381 * to be owned by the service itself.
2382 *
2383 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2384 * for sharing files or sockets with other services. */
2385
2386 pp = path_join(params->prefix[type], "private");
2387 if (!pp) {
2388 r = -ENOMEM;
2389 goto fail;
2390 }
2391
2392 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2393 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2394 if (r < 0)
2395 goto fail;
2396
2397 if (!path_extend(&pp, context->directories[type].items[i].path)) {
2398 r = -ENOMEM;
2399 goto fail;
2400 }
2401
2402 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2403 r = mkdir_parents_label(pp, 0755);
2404 if (r < 0)
2405 goto fail;
2406
2407 if (is_dir(p, false) > 0 &&
2408 (laccess(pp, F_OK) == -ENOENT)) {
2409
2410 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2411 * it over. Most likely the service has been upgraded from one that didn't use
2412 * DynamicUser=1, to one that does. */
2413
2414 log_exec_info(context,
2415 params,
2416 "Found pre-existing public %s= directory %s, migrating to %s.\n"
2417 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2418 exec_directory_type_to_string(type), p, pp);
2419
2420 r = RET_NERRNO(rename(p, pp));
2421 if (r < 0)
2422 goto fail;
2423 } else {
2424 /* Otherwise, create the actual directory for the service */
2425
2426 r = mkdir_label(pp, context->directories[type].mode);
2427 if (r < 0 && r != -EEXIST)
2428 goto fail;
2429 }
2430
2431 if (!context->directories[type].items[i].only_create) {
2432 /* And link it up from the original place.
2433 * Notes
2434 * 1) If a mount namespace is going to be used, then this symlink remains on
2435 * the host, and a new one for the child namespace will be created later.
2436 * 2) It is not necessary to create this symlink when one of its parent
2437 * directories is specified and already created. E.g.
2438 * StateDirectory=foo foo/bar
2439 * In that case, the inode points to pp and p for "foo/bar" are the same:
2440 * pp = "/var/lib/private/foo/bar"
2441 * p = "/var/lib/foo/bar"
2442 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2443 * we do not need to create the symlink, but we cannot create the symlink.
2444 * See issue #24783. */
2445 r = symlink_idempotent(pp, p, true);
2446 if (r < 0)
2447 goto fail;
2448 }
2449
2450 } else {
2451 _cleanup_free_ char *target = NULL;
2452
2453 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2454 readlink_and_make_absolute(p, &target) >= 0) {
2455 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2456
2457 /* This already exists and is a symlink? Interesting. Maybe it's one created
2458 * by DynamicUser=1 (see above)?
2459 *
2460 * We do this for all directory types except for ConfigurationDirectory=,
2461 * since they all support the private/ symlink logic at least in some
2462 * configurations, see above. */
2463
2464 r = chase(target, NULL, 0, &target_resolved, NULL);
2465 if (r < 0)
2466 goto fail;
2467
2468 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2469 if (!q) {
2470 r = -ENOMEM;
2471 goto fail;
2472 }
2473
2474 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2475 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2476 if (r < 0)
2477 goto fail;
2478
2479 if (path_equal(q_resolved, target_resolved)) {
2480
2481 /* Hmm, apparently DynamicUser= was once turned on for this service,
2482 * but is no longer. Let's move the directory back up. */
2483
2484 log_exec_info(context,
2485 params,
2486 "Found pre-existing private %s= directory %s, migrating to %s.\n"
2487 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2488 exec_directory_type_to_string(type), q, p);
2489
2490 r = RET_NERRNO(unlink(p));
2491 if (r < 0)
2492 goto fail;
2493
2494 r = RET_NERRNO(rename(q, p));
2495 if (r < 0)
2496 goto fail;
2497 }
2498 }
2499
2500 r = mkdir_label(p, context->directories[type].mode);
2501 if (r < 0) {
2502 if (r != -EEXIST)
2503 goto fail;
2504
2505 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2506 struct stat st;
2507
2508 /* Don't change the owner/access mode of the configuration directory,
2509 * as in the common case it is not written to by a service, and shall
2510 * not be writable. */
2511
2512 r = RET_NERRNO(stat(p, &st));
2513 if (r < 0)
2514 goto fail;
2515
2516 /* Still complain if the access mode doesn't match */
2517 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2518 log_exec_warning(context,
2519 params,
2520 "%s \'%s\' already exists but the mode is different. "
2521 "(File system: %o %sMode: %o)",
2522 exec_directory_type_to_string(type), context->directories[type].items[i].path,
2523 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2524
2525 continue;
2526 }
2527 }
2528 }
2529
2530 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2531 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2532 * current UID/GID ownership.) */
2533 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2534 if (r < 0)
2535 goto fail;
2536
2537 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2538 * available to user code anyway */
2539 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2540 continue;
2541
2542 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2543 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2544 * assignments to exist. */
2545 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2546 if (r < 0)
2547 goto fail;
2548 }
2549
2550 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2551 * they are set up later, to allow configuring empty var/run/etc. */
2552 if (!needs_mount_namespace)
2553 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2554 r = create_many_symlinks(params->prefix[type],
2555 context->directories[type].items[i].path,
2556 context->directories[type].items[i].symlinks);
2557 if (r < 0)
2558 goto fail;
2559 }
2560
2561 return 0;
2562
2563 fail:
2564 *exit_status = exit_status_table[type];
2565 return r;
2566 }
2567
#if ENABLE_SMACK
/* Applies the SMACK label for the process about to be executed.
 *
 * An explicitly configured process label always wins. Otherwise, if a fallback label is set, the
 * SMACK exec label read from the executable is applied, or the fallback label if the executable
 * carries none. If neither label is configured nothing is done.
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
static int setup_smack(
                const ExecParameters *params,
                const ExecContext *context,
                int executable_fd) {

        _cleanup_free_ char *exec_label = NULL;
        int r;

        assert(params);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

        if (!params->fallback_smack_process_label)
                return 0;

        /* A missing xattr on the executable is fine, we use the fallback label then. */
        r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
        if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                return r;

        r = mac_smack_apply_pid(0, exec_label ?: params->fallback_smack_process_label);
        if (r < 0)
                return r;

        return 0;
}
#endif
2597
2598 static int compile_bind_mounts(
2599 const ExecContext *context,
2600 const ExecParameters *params,
2601 BindMount **ret_bind_mounts,
2602 size_t *ret_n_bind_mounts,
2603 char ***ret_empty_directories) {
2604
2605 _cleanup_strv_free_ char **empty_directories = NULL;
2606 BindMount *bind_mounts = NULL;
2607 size_t n, h = 0;
2608 int r;
2609
2610 assert(context);
2611 assert(params);
2612 assert(ret_bind_mounts);
2613 assert(ret_n_bind_mounts);
2614 assert(ret_empty_directories);
2615
2616 CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2617
2618 n = context->n_bind_mounts;
2619 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2620 if (!params->prefix[t])
2621 continue;
2622
2623 for (size_t i = 0; i < context->directories[t].n_items; i++)
2624 n += !context->directories[t].items[i].only_create;
2625 }
2626
2627 if (n <= 0) {
2628 *ret_bind_mounts = NULL;
2629 *ret_n_bind_mounts = 0;
2630 *ret_empty_directories = NULL;
2631 return 0;
2632 }
2633
2634 bind_mounts = new(BindMount, n);
2635 if (!bind_mounts)
2636 return -ENOMEM;
2637
2638 for (size_t i = 0; i < context->n_bind_mounts; i++) {
2639 BindMount *item = context->bind_mounts + i;
2640 _cleanup_free_ char *s = NULL, *d = NULL;
2641
2642 s = strdup(item->source);
2643 if (!s)
2644 return -ENOMEM;
2645
2646 d = strdup(item->destination);
2647 if (!d)
2648 return -ENOMEM;
2649
2650 bind_mounts[h++] = (BindMount) {
2651 .source = TAKE_PTR(s),
2652 .destination = TAKE_PTR(d),
2653 .read_only = item->read_only,
2654 .recursive = item->recursive,
2655 .ignore_enoent = item->ignore_enoent,
2656 };
2657 }
2658
2659 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2660 if (!params->prefix[t])
2661 continue;
2662
2663 if (context->directories[t].n_items == 0)
2664 continue;
2665
2666 if (exec_directory_is_private(context, t) &&
2667 !exec_context_with_rootfs(context)) {
2668 char *private_root;
2669
2670 /* So this is for a dynamic user, and we need to make sure the process can access its own
2671 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2672 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2673
2674 private_root = path_join(params->prefix[t], "private");
2675 if (!private_root)
2676 return -ENOMEM;
2677
2678 r = strv_consume(&empty_directories, private_root);
2679 if (r < 0)
2680 return r;
2681 }
2682
2683 for (size_t i = 0; i < context->directories[t].n_items; i++) {
2684 _cleanup_free_ char *s = NULL, *d = NULL;
2685
2686 /* When one of the parent directories is in the list, we cannot create the symlink
2687 * for the child directory. See also the comments in setup_exec_directory(). */
2688 if (context->directories[t].items[i].only_create)
2689 continue;
2690
2691 if (exec_directory_is_private(context, t))
2692 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
2693 else
2694 s = path_join(params->prefix[t], context->directories[t].items[i].path);
2695 if (!s)
2696 return -ENOMEM;
2697
2698 if (exec_directory_is_private(context, t) &&
2699 exec_context_with_rootfs(context))
2700 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2701 * directory is not created on the root directory. So, let's bind-mount the directory
2702 * on the 'non-private' place. */
2703 d = path_join(params->prefix[t], context->directories[t].items[i].path);
2704 else
2705 d = strdup(s);
2706 if (!d)
2707 return -ENOMEM;
2708
2709 bind_mounts[h++] = (BindMount) {
2710 .source = TAKE_PTR(s),
2711 .destination = TAKE_PTR(d),
2712 .read_only = false,
2713 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2714 .recursive = true,
2715 .ignore_enoent = false,
2716 };
2717 }
2718 }
2719
2720 assert(h == n);
2721
2722 *ret_bind_mounts = TAKE_PTR(bind_mounts);
2723 *ret_n_bind_mounts = n;
2724 *ret_empty_directories = TAKE_PTR(empty_directories);
2725
2726 return (int) n;
2727 }
2728
2729 /* ret_symlinks will contain a list of pairs src:dest that describes
2730 * the symlinks to create later on. For example, the symlinks needed
2731 * to safely give private directories to DynamicUser=1 users. */
2732 static int compile_symlinks(
2733 const ExecContext *context,
2734 const ExecParameters *params,
2735 bool setup_os_release_symlink,
2736 char ***ret_symlinks) {
2737
2738 _cleanup_strv_free_ char **symlinks = NULL;
2739 int r;
2740
2741 assert(context);
2742 assert(params);
2743 assert(ret_symlinks);
2744
2745 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2746 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
2747 _cleanup_free_ char *private_path = NULL, *path = NULL;
2748
2749 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
2750 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
2751
2752 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2753 dst_abs = path_join(params->prefix[dt], *symlink);
2754 if (!src_abs || !dst_abs)
2755 return -ENOMEM;
2756
2757 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2758 if (r < 0)
2759 return r;
2760 }
2761
2762 if (!exec_directory_is_private(context, dt) ||
2763 exec_context_with_rootfs(context) ||
2764 context->directories[dt].items[i].only_create)
2765 continue;
2766
2767 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
2768 if (!private_path)
2769 return -ENOMEM;
2770
2771 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2772 if (!path)
2773 return -ENOMEM;
2774
2775 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2776 if (r < 0)
2777 return r;
2778 }
2779 }
2780
2781 /* We make the host's os-release available via a symlink, so that we can copy it atomically
2782 * and readers will never get a half-written version. Note that, while the paths specified here are
2783 * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2784 * 'os-release -> .os-release-stage/os-release' is what will be created. */
2785 if (setup_os_release_symlink) {
2786 r = strv_extend_many(
2787 &symlinks,
2788 "/run/host/.os-release-stage/os-release",
2789 "/run/host/os-release");
2790 if (r < 0)
2791 return r;
2792 }
2793
2794 *ret_symlinks = TAKE_PTR(symlinks);
2795
2796 return 0;
2797 }
2798
2799 static bool insist_on_sandboxing(
2800 const ExecContext *context,
2801 const char *root_dir,
2802 const char *root_image,
2803 const BindMount *bind_mounts,
2804 size_t n_bind_mounts) {
2805
2806 assert(context);
2807 assert(n_bind_mounts == 0 || bind_mounts);
2808
2809 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2810 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2811 * rearrange stuff in a way we cannot ignore gracefully. */
2812
2813 if (context->n_temporary_filesystems > 0)
2814 return true;
2815
2816 if (root_dir || root_image)
2817 return true;
2818
2819 if (context->n_mount_images > 0)
2820 return true;
2821
2822 if (context->dynamic_user)
2823 return true;
2824
2825 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
2826 return true;
2827
2828 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2829 * essential. */
2830 for (size_t i = 0; i < n_bind_mounts; i++)
2831 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2832 return true;
2833
2834 if (context->log_namespace)
2835 return true;
2836
2837 return false;
2838 }
2839
2840 static int setup_ephemeral(
2841 const ExecContext *context,
2842 ExecRuntime *runtime,
2843 char **root_image, /* both input and output! modified if ephemeral logic enabled */
2844 char **root_directory) { /* ditto */
2845
2846 _cleanup_close_ int fd = -EBADF;
2847 _cleanup_free_ char *new_root = NULL;
2848 int r;
2849
2850 assert(context);
2851 assert(root_image);
2852 assert(root_directory);
2853
2854 if (!*root_image && !*root_directory)
2855 return 0;
2856
2857 if (!runtime || !runtime->ephemeral_copy)
2858 return 0;
2859
2860 assert(runtime->ephemeral_storage_socket[0] >= 0);
2861 assert(runtime->ephemeral_storage_socket[1] >= 0);
2862
2863 new_root = strdup(runtime->ephemeral_copy);
2864 if (!new_root)
2865 return log_oom_debug();
2866
2867 r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
2868 if (r < 0)
2869 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
2870
2871 CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
2872
2873 fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
2874 if (fd >= 0)
2875 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
2876 return 0;
2877 if (fd != -EAGAIN)
2878 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
2879
2880 if (*root_image) {
2881 log_debug("Making ephemeral copy of %s to %s", *root_image, new_root);
2882
2883 fd = copy_file(*root_image,
2884 new_root,
2885 O_EXCL,
2886 0600,
2887 COPY_LOCK_BSD|
2888 COPY_REFLINK|
2889 COPY_CRTIME);
2890 if (fd < 0)
2891 return log_debug_errno(fd, "Failed to copy image %s to %s: %m",
2892 *root_image, new_root);
2893
2894 /* A root image might be subject to lots of random writes so let's try to disable COW on it
2895 * which tends to not perform well in combination with lots of random writes.
2896 *
2897 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
2898 * copy, but we at least want to make the intention clear.
2899 */
2900 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
2901 if (r < 0)
2902 log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", new_root);
2903 } else {
2904 assert(*root_directory);
2905
2906 log_debug("Making ephemeral snapshot of %s to %s", *root_directory, new_root);
2907
2908 fd = btrfs_subvol_snapshot_at(
2909 AT_FDCWD, *root_directory,
2910 AT_FDCWD, new_root,
2911 BTRFS_SNAPSHOT_FALLBACK_COPY |
2912 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
2913 BTRFS_SNAPSHOT_RECURSIVE |
2914 BTRFS_SNAPSHOT_LOCK_BSD);
2915 if (fd < 0)
2916 return log_debug_errno(fd, "Failed to snapshot directory %s to %s: %m",
2917 *root_directory, new_root);
2918 }
2919
2920 r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
2921 if (r < 0)
2922 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
2923
2924 if (*root_image)
2925 free_and_replace(*root_image, new_root);
2926 else {
2927 assert(*root_directory);
2928 free_and_replace(*root_directory, new_root);
2929 }
2930
2931 return 1;
2932 }
2933
2934 static int verity_settings_prepare(
2935 VeritySettings *verity,
2936 const char *root_image,
2937 const void *root_hash,
2938 size_t root_hash_size,
2939 const char *root_hash_path,
2940 const void *root_hash_sig,
2941 size_t root_hash_sig_size,
2942 const char *root_hash_sig_path,
2943 const char *verity_data_path) {
2944
2945 int r;
2946
2947 assert(verity);
2948
2949 if (root_hash) {
2950 void *d;
2951
2952 d = memdup(root_hash, root_hash_size);
2953 if (!d)
2954 return -ENOMEM;
2955
2956 free_and_replace(verity->root_hash, d);
2957 verity->root_hash_size = root_hash_size;
2958 verity->designator = PARTITION_ROOT;
2959 }
2960
2961 if (root_hash_sig) {
2962 void *d;
2963
2964 d = memdup(root_hash_sig, root_hash_sig_size);
2965 if (!d)
2966 return -ENOMEM;
2967
2968 free_and_replace(verity->root_hash_sig, d);
2969 verity->root_hash_sig_size = root_hash_sig_size;
2970 verity->designator = PARTITION_ROOT;
2971 }
2972
2973 if (verity_data_path) {
2974 r = free_and_strdup(&verity->data_path, verity_data_path);
2975 if (r < 0)
2976 return r;
2977 }
2978
2979 r = verity_settings_load(
2980 verity,
2981 root_image,
2982 root_hash_path,
2983 root_hash_sig_path);
2984 if (r < 0)
2985 return log_debug_errno(r, "Failed to load root hash: %m");
2986
2987 return 0;
2988 }
2989
2990 static int pick_versions(
2991 const ExecContext *context,
2992 const ExecParameters *params,
2993 char **ret_root_image,
2994 char **ret_root_directory) {
2995
2996 int r;
2997
2998 assert(context);
2999 assert(params);
3000 assert(ret_root_image);
3001 assert(ret_root_directory);
3002
3003 if (context->root_image) {
3004 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
3005
3006 r = path_pick(/* toplevel_path= */ NULL,
3007 /* toplevel_fd= */ AT_FDCWD,
3008 context->root_image,
3009 &pick_filter_image_raw,
3010 PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3011 &result);
3012 if (r < 0)
3013 return r;
3014
3015 if (!result.path)
3016 return log_exec_debug_errno(context, params, SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_image);
3017
3018 *ret_root_image = TAKE_PTR(result.path);
3019 *ret_root_directory = NULL;
3020 return r;
3021 }
3022
3023 if (context->root_directory) {
3024 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
3025
3026 r = path_pick(/* toplevel_path= */ NULL,
3027 /* toplevel_fd= */ AT_FDCWD,
3028 context->root_directory,
3029 &pick_filter_image_dir,
3030 PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3031 &result);
3032 if (r < 0)
3033 return r;
3034
3035 if (!result.path)
3036 return log_exec_debug_errno(context, params, SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_directory);
3037
3038 *ret_root_image = NULL;
3039 *ret_root_directory = TAKE_PTR(result.path);
3040 return r;
3041 }
3042
3043 *ret_root_image = *ret_root_directory = NULL;
3044 return 0;
3045 }
3046
3047 static int apply_mount_namespace(
3048 ExecCommandFlags command_flags,
3049 const ExecContext *context,
3050 const ExecParameters *params,
3051 ExecRuntime *runtime,
3052 const char *memory_pressure_path,
3053 bool needs_sandboxing,
3054 char **error_path) {
3055
3056 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3057 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3058 **read_write_paths_cleanup = NULL;
3059 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3060 *extension_dir = NULL, *host_os_release_stage = NULL, *root_image = NULL, *root_dir = NULL;
3061 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3062 char **read_write_paths;
3063 bool setup_os_release_symlink;
3064 BindMount *bind_mounts = NULL;
3065 size_t n_bind_mounts = 0;
3066 int r;
3067
3068 assert(context);
3069
3070 CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3071
3072 if (params->flags & EXEC_APPLY_CHROOT) {
3073 r = pick_versions(
3074 context,
3075 params,
3076 &root_image,
3077 &root_dir);
3078 if (r < 0)
3079 return r;
3080
3081 r = setup_ephemeral(
3082 context,
3083 runtime,
3084 &root_image,
3085 &root_dir);
3086 if (r < 0)
3087 return r;
3088 }
3089
3090 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3091 if (r < 0)
3092 return r;
3093
3094 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3095 * service will need to write to it in order to start the notifications. */
3096 if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3097 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3098 if (!read_write_paths_cleanup)
3099 return -ENOMEM;
3100
3101 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3102 if (r < 0)
3103 return r;
3104
3105 read_write_paths = read_write_paths_cleanup;
3106 } else
3107 read_write_paths = context->read_write_paths;
3108
3109 if (needs_sandboxing) {
3110 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3111 * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3112 * use here. This does not apply when we are using /run/systemd/empty as fallback. */
3113
3114 if (context->private_tmp && runtime && runtime->shared) {
3115 if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3116 tmp_dir = runtime->shared->tmp_dir;
3117 else if (runtime->shared->tmp_dir)
3118 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
3119
3120 if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3121 var_tmp_dir = runtime->shared->var_tmp_dir;
3122 else if (runtime->shared->var_tmp_dir)
3123 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
3124 }
3125 }
3126
3127 /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3128 setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
3129 r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3130 if (r < 0)
3131 return r;
3132
3133 if (context->mount_propagation_flag == MS_SHARED)
3134 log_exec_debug(context,
3135 params,
3136 "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3137
3138 r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path);
3139 if (r < 0)
3140 return r;
3141
3142 if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
3143 propagate_dir = path_join("/run/systemd/propagate/", params->unit_id);
3144 if (!propagate_dir)
3145 return -ENOMEM;
3146
3147 incoming_dir = strdup("/run/systemd/incoming");
3148 if (!incoming_dir)
3149 return -ENOMEM;
3150
3151 extension_dir = strdup("/run/systemd/unit-extensions");
3152 if (!extension_dir)
3153 return -ENOMEM;
3154
3155 /* If running under a different root filesystem, propagate the host's os-release. We make a
3156 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3157 if (setup_os_release_symlink) {
3158 host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3159 if (!host_os_release_stage)
3160 return -ENOMEM;
3161 }
3162 } else {
3163 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3164
3165 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3166 return -ENOMEM;
3167
3168 if (setup_os_release_symlink) {
3169 if (asprintf(&host_os_release_stage,
3170 "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3171 geteuid()) < 0)
3172 return -ENOMEM;
3173 }
3174 }
3175
3176 if (root_image) {
3177 r = verity_settings_prepare(
3178 &verity,
3179 root_image,
3180 context->root_hash, context->root_hash_size, context->root_hash_path,
3181 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3182 context->root_verity);
3183 if (r < 0)
3184 return r;
3185 }
3186
3187 NamespaceParameters parameters = {
3188 .runtime_scope = params->runtime_scope,
3189
3190 .root_directory = root_dir,
3191 .root_image = root_image,
3192 .root_image_options = context->root_image_options,
3193 .root_image_policy = context->root_image_policy ?: &image_policy_service,
3194
3195 .read_write_paths = read_write_paths,
3196 .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
3197 .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
3198
3199 .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
3200 .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
3201
3202 .empty_directories = empty_directories,
3203 .symlinks = symlinks,
3204
3205 .bind_mounts = bind_mounts,
3206 .n_bind_mounts = n_bind_mounts,
3207
3208 .temporary_filesystems = context->temporary_filesystems,
3209 .n_temporary_filesystems = context->n_temporary_filesystems,
3210
3211 .mount_images = context->mount_images,
3212 .n_mount_images = context->n_mount_images,
3213 .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
3214
3215 .tmp_dir = tmp_dir,
3216 .var_tmp_dir = var_tmp_dir,
3217
3218 .creds_path = creds_path,
3219 .log_namespace = context->log_namespace,
3220 .mount_propagation_flag = context->mount_propagation_flag,
3221
3222 .verity = &verity,
3223
3224 .extension_images = context->extension_images,
3225 .n_extension_images = context->n_extension_images,
3226 .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
3227 .extension_directories = context->extension_directories,
3228
3229 .propagate_dir = propagate_dir,
3230 .incoming_dir = incoming_dir,
3231 .extension_dir = extension_dir,
3232 .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
3233 .host_os_release_stage = host_os_release_stage,
3234
3235 /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
3236 * otherwise enforce it, don't ignore protected paths and fail if we are enable to apply the
3237 * sandbox inside the mount namespace. */
3238 .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
3239
3240 .protect_control_groups = needs_sandboxing && context->protect_control_groups,
3241 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
3242 .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
3243 .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
3244 .protect_hostname = needs_sandboxing && context->protect_hostname,
3245
3246 .private_dev = needs_sandboxing && context->private_devices,
3247 .private_network = needs_sandboxing && exec_needs_network_namespace(context),
3248 .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
3249
3250 .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
3251
3252 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3253 .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
3254
3255 .protect_home = needs_sandboxing ? context->protect_home : false,
3256 .protect_system = needs_sandboxing ? context->protect_system : false,
3257 .protect_proc = needs_sandboxing ? context->protect_proc : false,
3258 .proc_subset = needs_sandboxing ? context->proc_subset : false,
3259 };
3260
3261 r = setup_namespace(&parameters, error_path);
3262 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3263 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3264 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3265 * completely different execution environment. */
3266 if (r == -ENOANO) {
3267 if (insist_on_sandboxing(
3268 context,
3269 root_dir, root_image,
3270 bind_mounts,
3271 n_bind_mounts))
3272 return log_exec_debug_errno(context,
3273 params,
3274 SYNTHETIC_ERRNO(EOPNOTSUPP),
3275 "Failed to set up namespace, and refusing to continue since "
3276 "the selected namespacing options alter mount environment non-trivially.\n"
3277 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3278 n_bind_mounts,
3279 context->n_temporary_filesystems,
3280 yes_no(root_dir),
3281 yes_no(root_image),
3282 yes_no(context->dynamic_user));
3283
3284 log_exec_debug(context, params, "Failed to set up namespace, assuming containerized execution and ignoring.");
3285 return 0;
3286 }
3287
3288 return r;
3289 }
3290
3291 static int apply_working_directory(
3292 const ExecContext *context,
3293 const ExecParameters *params,
3294 ExecRuntime *runtime,
3295 const char *home,
3296 int *exit_status) {
3297
3298 const char *wd;
3299 int r;
3300
3301 assert(context);
3302 assert(exit_status);
3303
3304 if (context->working_directory_home) {
3305 if (!home) {
3306 *exit_status = EXIT_CHDIR;
3307 return -ENXIO;
3308 }
3309
3310 wd = home;
3311 } else
3312 wd = empty_to_root(context->working_directory);
3313
3314 if (params->flags & EXEC_APPLY_CHROOT)
3315 r = RET_NERRNO(chdir(wd));
3316 else {
3317 _cleanup_close_ int dfd = -EBADF;
3318
3319 r = chase(wd,
3320 (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory,
3321 CHASE_PREFIX_ROOT|CHASE_AT_RESOLVE_IN_ROOT,
3322 /* ret_path= */ NULL,
3323 &dfd);
3324 if (r >= 0)
3325 r = RET_NERRNO(fchdir(dfd));
3326 }
3327
3328 if (r < 0 && !context->working_directory_missing_ok) {
3329 *exit_status = EXIT_CHDIR;
3330 return r;
3331 }
3332
3333 return 0;
3334 }
3335
3336 static int apply_root_directory(
3337 const ExecContext *context,
3338 const ExecParameters *params,
3339 ExecRuntime *runtime,
3340 const bool needs_mount_ns,
3341 int *exit_status) {
3342
3343 assert(context);
3344 assert(exit_status);
3345
3346 if (params->flags & EXEC_APPLY_CHROOT)
3347 if (!needs_mount_ns && context->root_directory)
3348 if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
3349 *exit_status = EXIT_CHROOT;
3350 return -errno;
3351 }
3352
3353 return 0;
3354 }
3355
3356 static int setup_keyring(
3357 const ExecContext *context,
3358 const ExecParameters *p,
3359 uid_t uid, gid_t gid) {
3360
3361 key_serial_t keyring;
3362 int r = 0;
3363 uid_t saved_uid;
3364 gid_t saved_gid;
3365
3366 assert(context);
3367 assert(p);
3368
3369 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3370 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3371 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3372 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3373 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3374 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3375
3376 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3377 return 0;
3378
3379 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3380 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3381 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3382 * & group is just as nasty as acquiring a reference to the user keyring. */
3383
3384 saved_uid = getuid();
3385 saved_gid = getgid();
3386
3387 if (gid_is_valid(gid) && gid != saved_gid) {
3388 if (setregid(gid, -1) < 0)
3389 return log_exec_error_errno(context,
3390 p,
3391 errno,
3392 "Failed to change GID for user keyring: %m");
3393 }
3394
3395 if (uid_is_valid(uid) && uid != saved_uid) {
3396 if (setreuid(uid, -1) < 0) {
3397 r = log_exec_error_errno(context,
3398 p,
3399 errno,
3400 "Failed to change UID for user keyring: %m");
3401 goto out;
3402 }
3403 }
3404
3405 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3406 if (keyring == -1) {
3407 if (errno == ENOSYS)
3408 log_exec_debug_errno(context,
3409 p,
3410 errno,
3411 "Kernel keyring not supported, ignoring.");
3412 else if (ERRNO_IS_PRIVILEGE(errno))
3413 log_exec_debug_errno(context,
3414 p,
3415 errno,
3416 "Kernel keyring access prohibited, ignoring.");
3417 else if (errno == EDQUOT)
3418 log_exec_debug_errno(context,
3419 p,
3420 errno,
3421 "Out of kernel keyrings to allocate, ignoring.");
3422 else
3423 r = log_exec_error_errno(context,
3424 p,
3425 errno,
3426 "Setting up kernel keyring failed: %m");
3427
3428 goto out;
3429 }
3430
3431 /* When requested link the user keyring into the session keyring. */
3432 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3433
3434 if (keyctl(KEYCTL_LINK,
3435 KEY_SPEC_USER_KEYRING,
3436 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3437 r = log_exec_error_errno(context,
3438 p,
3439 errno,
3440 "Failed to link user keyring into session keyring: %m");
3441 goto out;
3442 }
3443 }
3444
3445 /* Restore uid/gid back */
3446 if (uid_is_valid(uid) && uid != saved_uid) {
3447 if (setreuid(saved_uid, -1) < 0) {
3448 r = log_exec_error_errno(context,
3449 p,
3450 errno,
3451 "Failed to change UID back for user keyring: %m");
3452 goto out;
3453 }
3454 }
3455
3456 if (gid_is_valid(gid) && gid != saved_gid) {
3457 if (setregid(saved_gid, -1) < 0)
3458 return log_exec_error_errno(context,
3459 p,
3460 errno,
3461 "Failed to change GID back for user keyring: %m");
3462 }
3463
3464 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3465 if (!sd_id128_is_null(p->invocation_id)) {
3466 key_serial_t key;
3467
3468 key = add_key("user",
3469 "invocation_id",
3470 &p->invocation_id,
3471 sizeof(p->invocation_id),
3472 KEY_SPEC_SESSION_KEYRING);
3473 if (key == -1)
3474 log_exec_debug_errno(context,
3475 p,
3476 errno,
3477 "Failed to add invocation ID to keyring, ignoring: %m");
3478 else {
3479 if (keyctl(KEYCTL_SETPERM, key,
3480 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3481 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3482 r = log_exec_error_errno(context,
3483 p,
3484 errno,
3485 "Failed to restrict invocation ID permission: %m");
3486 }
3487 }
3488
3489 out:
3490 /* Revert back uid & gid for the last time, and exit */
3491 /* no extra logging, as only the first already reported error matters */
3492 if (getuid() != saved_uid)
3493 (void) setreuid(saved_uid, -1);
3494
3495 if (getgid() != saved_gid)
3496 (void) setregid(saved_gid, -1);
3497
3498 return r;
3499 }
3500
/* Appends each non-negative fd of 'pair' to 'array' at index *n, advancing *n for every entry
 * added. Negative entries are skipped. The caller must make sure 'array' has room for up to
 * two more elements. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                (*n)++;
        }
}
3511
3512 static int close_remaining_fds(
3513 const ExecParameters *params,
3514 const ExecRuntime *runtime,
3515 int socket_fd,
3516 const int *fds, size_t n_fds) {
3517
3518 size_t n_dont_close = 0;
3519 int dont_close[n_fds + 14];
3520
3521 assert(params);
3522
3523 if (params->stdin_fd >= 0)
3524 dont_close[n_dont_close++] = params->stdin_fd;
3525 if (params->stdout_fd >= 0)
3526 dont_close[n_dont_close++] = params->stdout_fd;
3527 if (params->stderr_fd >= 0)
3528 dont_close[n_dont_close++] = params->stderr_fd;
3529
3530 if (socket_fd >= 0)
3531 dont_close[n_dont_close++] = socket_fd;
3532 if (n_fds > 0) {
3533 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3534 n_dont_close += n_fds;
3535 }
3536
3537 if (runtime)
3538 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3539
3540 if (runtime && runtime->shared) {
3541 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3542 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
3543 }
3544
3545 if (runtime && runtime->dynamic_creds) {
3546 if (runtime->dynamic_creds->user)
3547 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3548 if (runtime->dynamic_creds->group)
3549 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
3550 }
3551
3552 if (params->user_lookup_fd >= 0)
3553 dont_close[n_dont_close++] = params->user_lookup_fd;
3554
3555 return close_all_fds(dont_close, n_dont_close);
3556 }
3557
/* Reports the resolved UID/GID to PID 1 once known: a single write consisting of the UID, the
 * GID and the unit name. Nothing is sent if there is no lookup fd, or if neither a valid UID
 * nor a valid GID was resolved.
 *
 * Returns 0 on success or when nothing was sent, -errno if writev() fails. */
static int send_user_lookup(
                const char *unit_id,
                int user_lookup_fd,
                uid_t uid,
                gid_t gid) {

        assert(unit_id);

        if (user_lookup_fd < 0)
                return 0;

        if (!uid_is_valid(uid) && !gid_is_valid(gid))
                return 0;

        struct iovec iov[] = {
                IOVEC_MAKE(&uid, sizeof(uid)),
                IOVEC_MAKE(&gid, sizeof(gid)),
                IOVEC_MAKE_STRING(unit_id),
        };

        if (writev(user_lookup_fd, iov, 3) < 0)
                return -errno;

        return 0;
}
3585
3586 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3587 int r;
3588
3589 assert(c);
3590 assert(home);
3591 assert(buf);
3592
3593 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3594
3595 if (*home)
3596 return 0;
3597
3598 if (!c->working_directory_home)
3599 return 0;
3600
3601 r = get_home_dir(buf);
3602 if (r < 0)
3603 return r;
3604
3605 *home = *buf;
3606 return 1;
3607 }
3608
3609 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3610 _cleanup_strv_free_ char ** list = NULL;
3611 int r;
3612
3613 assert(c);
3614 assert(p);
3615 assert(ret);
3616
3617 assert(c->dynamic_user);
3618
3619 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3620 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3621 * directories. */
3622
3623 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3624 if (t == EXEC_DIRECTORY_CONFIGURATION)
3625 continue;
3626
3627 if (!p->prefix[t])
3628 continue;
3629
3630 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3631 char *e;
3632
3633 if (exec_directory_is_private(c, t))
3634 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3635 else
3636 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3637 if (!e)
3638 return -ENOMEM;
3639
3640 r = strv_consume(&list, e);
3641 if (r < 0)
3642 return r;
3643 }
3644 }
3645
3646 *ret = TAKE_PTR(list);
3647
3648 return 0;
3649 }
3650
3651 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3652 _cleanup_(cpu_set_reset) CPUSet s = {};
3653 int r;
3654
3655 assert(c);
3656 assert(ret);
3657
3658 if (!c->numa_policy.nodes.set) {
3659 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3660 return 0;
3661 }
3662
3663 r = numa_to_cpu_set(&c->numa_policy, &s);
3664 if (r < 0)
3665 return r;
3666
3667 cpu_set_reset(ret);
3668
3669 return cpu_set_add_all(ret, &s);
3670 }
3671
3672 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int *fd) {
3673 int r;
3674
3675 assert(fds);
3676 assert(n_fds);
3677 assert(*n_fds < fds_size);
3678 assert(fd);
3679
3680 if (*fd < 0)
3681 return 0;
3682
3683 if (*fd < 3 + (int) *n_fds) {
3684 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3685 * the fds we pass to the process (or which are closed only during execve). */
3686
3687 r = fcntl(*fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3688 if (r < 0)
3689 return -errno;
3690
3691 close_and_replace(*fd, r);
3692 }
3693
3694 fds[(*n_fds)++] = *fd;
3695 return 1;
3696 }
3697
3698 static int connect_unix_harder(const ExecContext *c, const ExecParameters *p, const OpenFile *of, int ofd) {
3699 union sockaddr_union addr = {
3700 .un.sun_family = AF_UNIX,
3701 };
3702 socklen_t sa_len;
3703 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3704 int r;
3705
3706 assert(c);
3707 assert(p);
3708 assert(of);
3709 assert(ofd >= 0);
3710
3711 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3712 if (r < 0)
3713 return log_exec_error_errno(c, p, r, "Failed to set sockaddr for %s: %m", of->path);
3714
3715 sa_len = r;
3716
3717 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
3718 _cleanup_close_ int fd = -EBADF;
3719
3720 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
3721 if (fd < 0)
3722 return log_exec_error_errno(c,
3723 p,
3724 errno,
3725 "Failed to create socket for %s: %m",
3726 of->path);
3727
3728 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3729 if (r == -EPROTOTYPE)
3730 continue;
3731 if (r < 0)
3732 return log_exec_error_errno(c,
3733 p,
3734 r,
3735 "Failed to connect socket for %s: %m",
3736 of->path);
3737
3738 return TAKE_FD(fd);
3739 }
3740
3741 return log_exec_error_errno(c,
3742 p,
3743 SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".",
3744 of->path);
3745 }
3746
3747 static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const OpenFile *of) {
3748 struct stat st;
3749 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3750
3751 assert(c);
3752 assert(p);
3753 assert(of);
3754
3755 ofd = open(of->path, O_PATH | O_CLOEXEC);
3756 if (ofd < 0)
3757 return log_exec_error_errno(c, p, errno, "Could not open \"%s\": %m", of->path);
3758
3759 if (fstat(ofd, &st) < 0)
3760 return log_exec_error_errno(c, p, errno, "Failed to stat %s: %m", of->path);
3761
3762 if (S_ISSOCK(st.st_mode)) {
3763 fd = connect_unix_harder(c, p, of, ofd);
3764 if (fd < 0)
3765 return fd;
3766
3767 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
3768 return log_exec_error_errno(c, p, errno, "Failed to shutdown send for socket %s: %m",
3769 of->path);
3770
3771 log_exec_debug(c, p, "socket %s opened (fd=%d)", of->path, fd);
3772 } else {
3773 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3774 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3775 flags |= O_APPEND;
3776 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3777 flags |= O_TRUNC;
3778
3779 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3780 if (fd < 0)
3781 return log_exec_error_errno(c, p, fd, "Failed to open file %s: %m", of->path);
3782
3783 log_exec_debug(c, p, "file %s opened (fd=%d)", of->path, fd);
3784 }
3785
3786 return TAKE_FD(fd);
3787 }
3788
3789 static int collect_open_file_fds(const ExecContext *c, ExecParameters *p, size_t *n_fds) {
3790 int r;
3791
3792 assert(c);
3793 assert(p);
3794 assert(n_fds);
3795
3796 LIST_FOREACH(open_files, of, p->open_files) {
3797 _cleanup_close_ int fd = -EBADF;
3798
3799 fd = get_open_file_fd(c, p, of);
3800 if (fd < 0) {
3801 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3802 log_exec_debug_errno(c, p, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
3803 continue;
3804 }
3805
3806 return fd;
3807 }
3808
3809 if (!GREEDY_REALLOC(p->fds, *n_fds + 1))
3810 return -ENOMEM;
3811
3812 r = strv_extend(&p->fd_names, of->fdname);
3813 if (r < 0)
3814 return r;
3815
3816 p->fds[*n_fds] = TAKE_FD(fd);
3817
3818 (*n_fds)++;
3819 }
3820
3821 return 0;
3822 }
3823
3824 static void log_command_line(
3825 const ExecContext *context,
3826 const ExecParameters *params,
3827 const char *msg,
3828 const char *executable,
3829 char **argv) {
3830
3831 assert(context);
3832 assert(params);
3833 assert(msg);
3834 assert(executable);
3835
3836 if (!DEBUG_LOGGING)
3837 return;
3838
3839 _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3840
3841 log_exec_struct(context, params, LOG_DEBUG,
3842 "EXECUTABLE=%s", executable,
3843 LOG_EXEC_MESSAGE(params, "%s: %s", msg, strnull(cmdline)),
3844 LOG_EXEC_INVOCATION_ID(params));
3845 }
3846
3847 static bool exec_context_need_unprivileged_private_users(
3848 const ExecContext *context,
3849 const ExecParameters *params) {
3850
3851 assert(context);
3852 assert(params);
3853
3854 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3855 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3856 * (system manager) then we have privileges and don't need this. */
3857 if (params->runtime_scope != RUNTIME_SCOPE_USER)
3858 return false;
3859
3860 return context->private_users ||
3861 context->private_tmp ||
3862 context->private_devices ||
3863 context->private_network ||
3864 context->network_namespace_path ||
3865 context->private_ipc ||
3866 context->ipc_namespace_path ||
3867 context->private_mounts > 0 ||
3868 context->mount_apivfs ||
3869 context->n_bind_mounts > 0 ||
3870 context->n_temporary_filesystems > 0 ||
3871 context->root_directory ||
3872 !strv_isempty(context->extension_directories) ||
3873 context->protect_system != PROTECT_SYSTEM_NO ||
3874 context->protect_home != PROTECT_HOME_NO ||
3875 context->protect_kernel_tunables ||
3876 context->protect_kernel_modules ||
3877 context->protect_kernel_logs ||
3878 context->protect_control_groups ||
3879 context->protect_clock ||
3880 context->protect_hostname ||
3881 !strv_isempty(context->read_write_paths) ||
3882 !strv_isempty(context->read_only_paths) ||
3883 !strv_isempty(context->inaccessible_paths) ||
3884 !strv_isempty(context->exec_paths) ||
3885 !strv_isempty(context->no_exec_paths);
3886 }
3887
3888 static bool exec_context_shall_confirm_spawn(const ExecContext *context) {
3889 assert(context);
3890
3891 if (confirm_spawn_disabled())
3892 return false;
3893
3894 /* For some reasons units remaining in the same process group
3895 * as PID 1 fail to acquire the console even if it's not used
3896 * by any process. So skip the confirmation question for them. */
3897 return !context->same_pgrp;
3898 }
3899
3900 static int exec_context_named_iofds(
3901 const ExecContext *c,
3902 const ExecParameters *p,
3903 int named_iofds[static 3]) {
3904
3905 size_t targets;
3906 const char* stdio_fdname[3];
3907 size_t n_fds;
3908
3909 assert(c);
3910 assert(p);
3911 assert(named_iofds);
3912
3913 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3914 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3915 (c->std_error == EXEC_OUTPUT_NAMED_FD);
3916
3917 for (size_t i = 0; i < 3; i++)
3918 stdio_fdname[i] = exec_context_fdname(c, i);
3919
3920 n_fds = p->n_storage_fds + p->n_socket_fds;
3921
3922 for (size_t i = 0; i < n_fds && targets > 0; i++)
3923 if (named_iofds[STDIN_FILENO] < 0 &&
3924 c->std_input == EXEC_INPUT_NAMED_FD &&
3925 stdio_fdname[STDIN_FILENO] &&
3926 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3927
3928 named_iofds[STDIN_FILENO] = p->fds[i];
3929 targets--;
3930
3931 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3932 c->std_output == EXEC_OUTPUT_NAMED_FD &&
3933 stdio_fdname[STDOUT_FILENO] &&
3934 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3935
3936 named_iofds[STDOUT_FILENO] = p->fds[i];
3937 targets--;
3938
3939 } else if (named_iofds[STDERR_FILENO] < 0 &&
3940 c->std_error == EXEC_OUTPUT_NAMED_FD &&
3941 stdio_fdname[STDERR_FILENO] &&
3942 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3943
3944 named_iofds[STDERR_FILENO] = p->fds[i];
3945 targets--;
3946 }
3947
3948 return targets == 0 ? 0 : -ENOENT;
3949 }
3950
3951 static void exec_shared_runtime_close(ExecSharedRuntime *shared) {
3952 if (!shared)
3953 return;
3954
3955 safe_close_pair(shared->netns_storage_socket);
3956 safe_close_pair(shared->ipcns_storage_socket);
3957 }
3958
3959 static void exec_runtime_close(ExecRuntime *rt) {
3960 if (!rt)
3961 return;
3962
3963 safe_close_pair(rt->ephemeral_storage_socket);
3964
3965 exec_shared_runtime_close(rt->shared);
3966 dynamic_creds_close(rt->dynamic_creds);
3967 }
3968
3969 static void exec_params_close(ExecParameters *p) {
3970 if (!p)
3971 return;
3972
3973 p->stdin_fd = safe_close(p->stdin_fd);
3974 p->stdout_fd = safe_close(p->stdout_fd);
3975 p->stderr_fd = safe_close(p->stderr_fd);
3976 }
3977
3978 int exec_invoke(
3979 const ExecCommand *command,
3980 const ExecContext *context,
3981 ExecParameters *params,
3982 ExecRuntime *runtime,
3983 const CGroupContext *cgroup_context,
3984 int *exit_status) {
3985
3986 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
3987 int r, ngids = 0;
3988 _cleanup_free_ gid_t *supplementary_gids = NULL;
3989 const char *username = NULL, *groupname = NULL;
3990 _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
3991 const char *home = NULL, *shell = NULL;
3992 char **final_argv = NULL;
3993 dev_t journal_stream_dev = 0;
3994 ino_t journal_stream_ino = 0;
3995 bool userns_set_up = false;
3996 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3997 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3998 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3999 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
4000 bool keep_seccomp_privileges = false;
4001 #if HAVE_SELINUX
4002 _cleanup_free_ char *mac_selinux_context_net = NULL;
4003 bool use_selinux = false;
4004 #endif
4005 #if ENABLE_SMACK
4006 bool use_smack = false;
4007 #endif
4008 #if HAVE_APPARMOR
4009 bool use_apparmor = false;
4010 #endif
4011 #if HAVE_SECCOMP
4012 uint64_t saved_bset = 0;
4013 #endif
4014 uid_t saved_uid = getuid();
4015 gid_t saved_gid = getgid();
4016 uid_t uid = UID_INVALID;
4017 gid_t gid = GID_INVALID;
4018 size_t n_fds, /* fds to pass to the child */
4019 n_keep_fds; /* total number of fds not to close */
4020 int secure_bits;
4021 _cleanup_free_ gid_t *gids_after_pam = NULL;
4022 int ngids_after_pam = 0;
4023
4024 int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
4025 size_t n_storage_fds, n_socket_fds;
4026
4027 assert(command);
4028 assert(context);
4029 assert(params);
4030 assert(exit_status);
4031
4032 /* This should be mostly redundant, as the log level is also passed as an argument of the executor,
4033 * and is already applied earlier. Just for safety. */
4034 if (context->log_level_max >= 0)
4035 log_set_max_level(context->log_level_max);
4036
4037 /* Explicitly test for CVE-2021-4034 inspired invocations */
4038 if (!command->path || strv_isempty(command->argv)) {
4039 *exit_status = EXIT_EXEC;
4040 return log_exec_error_errno(
4041 context,
4042 params,
4043 SYNTHETIC_ERRNO(EINVAL),
4044 "Invalid command line arguments.");
4045 }
4046
4047 LOG_CONTEXT_PUSH_EXEC(context, params);
4048
4049 if (context->std_input == EXEC_INPUT_SOCKET ||
4050 context->std_output == EXEC_OUTPUT_SOCKET ||
4051 context->std_error == EXEC_OUTPUT_SOCKET) {
4052
4053 if (params->n_socket_fds > 1)
4054 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
4055
4056 if (params->n_socket_fds == 0)
4057 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
4058
4059 socket_fd = params->fds[0];
4060 n_storage_fds = n_socket_fds = 0;
4061 } else {
4062 n_socket_fds = params->n_socket_fds;
4063 n_storage_fds = params->n_storage_fds;
4064 }
4065 n_fds = n_socket_fds + n_storage_fds;
4066
4067 r = exec_context_named_iofds(context, params, named_iofds);
4068 if (r < 0)
4069 return log_exec_error_errno(context, params, r, "Failed to load a named file descriptor: %m");
4070
4071 rename_process_from_path(command->path);
4072
4073 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4074 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4075 * both of which will be demoted to SIG_DFL. */
4076 (void) default_signals(SIGNALS_CRASH_HANDLER,
4077 SIGNALS_IGNORE);
4078
4079 if (context->ignore_sigpipe)
4080 (void) ignore_signals(SIGPIPE);
4081
4082 r = reset_signal_mask();
4083 if (r < 0) {
4084 *exit_status = EXIT_SIGNAL_MASK;
4085 return log_exec_error_errno(context, params, r, "Failed to set process signal mask: %m");
4086 }
4087
4088 if (params->idle_pipe)
4089 do_idle_pipe_dance(params->idle_pipe);
4090
4091 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4092 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4093 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4094 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4095
4096 log_forget_fds();
4097 log_set_open_when_needed(true);
4098 log_settle_target();
4099
4100 /* In case anything used libc syslog(), close this here, too */
4101 closelog();
4102
4103 r = collect_open_file_fds(context, params, &n_fds);
4104 if (r < 0) {
4105 *exit_status = EXIT_FDS;
4106 return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m");
4107 }
4108
4109 int keep_fds[n_fds + 3];
4110 memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int));
4111 n_keep_fds = n_fds;
4112
4113 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->exec_fd);
4114 if (r < 0) {
4115 *exit_status = EXIT_FDS;
4116 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4117 }
4118
4119 #if HAVE_LIBBPF
4120 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->bpf_restrict_fs_map_fd);
4121 if (r < 0) {
4122 *exit_status = EXIT_FDS;
4123 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4124 }
4125 #endif
4126
4127 r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds);
4128 if (r < 0) {
4129 *exit_status = EXIT_FDS;
4130 return log_exec_error_errno(context, params, r, "Failed to close unwanted file descriptors: %m");
4131 }
4132
4133 if (!context->same_pgrp &&
4134 setsid() < 0) {
4135 *exit_status = EXIT_SETSID;
4136 return log_exec_error_errno(context, params, errno, "Failed to create new process session: %m");
4137 }
4138
4139 exec_context_tty_reset(context, params);
4140
4141 if (params->shall_confirm_spawn && exec_context_shall_confirm_spawn(context)) {
4142 _cleanup_free_ char *cmdline = NULL;
4143
4144 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4145 if (!cmdline) {
4146 *exit_status = EXIT_MEMORY;
4147 return log_oom();
4148 }
4149
4150 r = ask_for_confirmation(context, params, cmdline);
4151 if (r != CONFIRM_EXECUTE) {
4152 if (r == CONFIRM_PRETEND_SUCCESS) {
4153 *exit_status = EXIT_SUCCESS;
4154 return 0;
4155 }
4156
4157 *exit_status = EXIT_CONFIRM;
4158 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ECANCELED),
4159 "Execution cancelled by the user");
4160 }
4161 }
4162
4163 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4164 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4165 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4166 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4167 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4168 if (setenv("SYSTEMD_ACTIVATION_UNIT", params->unit_id, true) != 0 ||
4169 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4170 *exit_status = EXIT_MEMORY;
4171 return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4172 }
4173
4174 if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4175 _cleanup_strv_free_ char **suggested_paths = NULL;
4176
4177 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4178 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4179 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4180 *exit_status = EXIT_USER;
4181 return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4182 }
4183
4184 r = compile_suggested_paths(context, params, &suggested_paths);
4185 if (r < 0) {
4186 *exit_status = EXIT_MEMORY;
4187 return log_oom();
4188 }
4189
4190 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4191 if (r < 0) {
4192 *exit_status = EXIT_USER;
4193 if (r == -EILSEQ)
4194 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4195 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4196 return log_exec_error_errno(context, params, r, "Failed to update dynamic user credentials: %m");
4197 }
4198
4199 if (!uid_is_valid(uid)) {
4200 *exit_status = EXIT_USER;
4201 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4202 }
4203
4204 if (!gid_is_valid(gid)) {
4205 *exit_status = EXIT_USER;
4206 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4207 }
4208
4209 if (runtime->dynamic_creds->user)
4210 username = runtime->dynamic_creds->user->name;
4211
4212 } else {
4213 if (context->user) {
4214 r = get_fixed_user(context->user, &username, &uid, &gid, &home, &shell);
4215 if (r < 0) {
4216 *exit_status = EXIT_USER;
4217 return log_exec_error_errno(context, params, r, "Failed to determine user credentials: %m");
4218 }
4219 }
4220
4221 if (context->group) {
4222 r = get_fixed_group(context->group, &groupname, &gid);
4223 if (r < 0) {
4224 *exit_status = EXIT_GROUP;
4225 return log_exec_error_errno(context, params, r, "Failed to determine group credentials: %m");
4226 }
4227 }
4228 }
4229
4230 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4231 r = get_supplementary_groups(context, username, groupname, gid,
4232 &supplementary_gids, &ngids);
4233 if (r < 0) {
4234 *exit_status = EXIT_GROUP;
4235 return log_exec_error_errno(context, params, r, "Failed to determine supplementary groups: %m");
4236 }
4237
4238 r = send_user_lookup(params->unit_id, params->user_lookup_fd, uid, gid);
4239 if (r < 0) {
4240 *exit_status = EXIT_USER;
4241 return log_exec_error_errno(context, params, r, "Failed to send user credentials to PID1: %m");
4242 }
4243
4244 params->user_lookup_fd = safe_close(params->user_lookup_fd);
4245
4246 r = acquire_home(context, uid, &home, &home_buffer);
4247 if (r < 0) {
4248 *exit_status = EXIT_CHDIR;
4249 return log_exec_error_errno(context, params, r, "Failed to determine $HOME for user: %m");
4250 }
4251
4252 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4253 if (socket_fd >= 0)
4254 (void) fd_nonblock(socket_fd, false);
4255
4256 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4257 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4258 if (params->cgroup_path) {
4259 _cleanup_free_ char *p = NULL;
4260
4261 r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4262 if (r < 0) {
4263 *exit_status = EXIT_CGROUP;
4264 return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4265 }
4266
4267 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4268 if (r == -EUCLEAN) {
4269 *exit_status = EXIT_CGROUP;
4270 return log_exec_error_errno(context, params, r, "Failed to attach process to cgroup %s "
4271 "because the cgroup or one of its parents or "
4272 "siblings is in the threaded mode: %m", p);
4273 }
4274 if (r < 0) {
4275 *exit_status = EXIT_CGROUP;
4276 return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", p);
4277 }
4278 }
4279
4280 if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4281 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4282 if (r < 0) {
4283 *exit_status = EXIT_NETWORK;
4284 return log_exec_error_errno(context, params, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4285 }
4286 }
4287
4288 if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4289 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4290 if (r < 0) {
4291 *exit_status = EXIT_NAMESPACE;
4292 return log_exec_error_errno(context, params, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4293 }
4294 }
4295
4296 r = setup_input(context, params, socket_fd, named_iofds);
4297 if (r < 0) {
4298 *exit_status = EXIT_STDIN;
4299 return log_exec_error_errno(context, params, r, "Failed to set up standard input: %m");
4300 }
4301
4302 r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4303 if (r < 0) {
4304 *exit_status = EXIT_STDOUT;
4305 return log_exec_error_errno(context, params, r, "Failed to set up standard output: %m");
4306 }
4307
4308 r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4309 if (r < 0) {
4310 *exit_status = EXIT_STDERR;
4311 return log_exec_error_errno(context, params, r, "Failed to set up standard error output: %m");
4312 }
4313
4314 if (context->oom_score_adjust_set) {
4315 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4316 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4317 r = set_oom_score_adjust(context->oom_score_adjust);
4318 if (ERRNO_IS_NEG_PRIVILEGE(r))
4319 log_exec_debug_errno(context, params, r,
4320 "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4321 else if (r < 0) {
4322 *exit_status = EXIT_OOM_ADJUST;
4323 return log_exec_error_errno(context, params, r, "Failed to adjust OOM setting: %m");
4324 }
4325 }
4326
4327 if (context->coredump_filter_set) {
4328 r = set_coredump_filter(context->coredump_filter);
4329 if (ERRNO_IS_NEG_PRIVILEGE(r))
4330 log_exec_debug_errno(context, params, r, "Failed to adjust coredump_filter, ignoring: %m");
4331 else if (r < 0) {
4332 *exit_status = EXIT_LIMITS;
4333 return log_exec_error_errno(context, params, r, "Failed to adjust coredump_filter: %m");
4334 }
4335 }
4336
4337 if (context->nice_set) {
4338 r = setpriority_closest(context->nice);
4339 if (r < 0) {
4340 *exit_status = EXIT_NICE;
4341 return log_exec_error_errno(context, params, r, "Failed to set up process scheduling priority (nice level): %m");
4342 }
4343 }
4344
4345 if (context->cpu_sched_set) {
4346 struct sched_param param = {
4347 .sched_priority = context->cpu_sched_priority,
4348 };
4349
4350 r = sched_setscheduler(0,
4351 context->cpu_sched_policy |
4352 (context->cpu_sched_reset_on_fork ?
4353 SCHED_RESET_ON_FORK : 0),
4354 &param);
4355 if (r < 0) {
4356 *exit_status = EXIT_SETSCHEDULER;
4357 return log_exec_error_errno(context, params, errno, "Failed to set up CPU scheduling: %m");
4358 }
4359 }
4360
4361 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4362 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4363 const CPUSet *cpu_set;
4364
4365 if (context->cpu_affinity_from_numa) {
4366 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4367 if (r < 0) {
4368 *exit_status = EXIT_CPUAFFINITY;
4369 return log_exec_error_errno(context, params, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4370 }
4371
4372 cpu_set = &converted_cpu_set;
4373 } else
4374 cpu_set = &context->cpu_set;
4375
4376 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4377 *exit_status = EXIT_CPUAFFINITY;
4378 return log_exec_error_errno(context, params, errno, "Failed to set up CPU affinity: %m");
4379 }
4380 }
4381
4382 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4383 r = apply_numa_policy(&context->numa_policy);
4384 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4385 log_exec_debug_errno(context, params, r, "NUMA support not available, ignoring.");
4386 else if (r < 0) {
4387 *exit_status = EXIT_NUMA_POLICY;
4388 return log_exec_error_errno(context, params, r, "Failed to set NUMA memory policy: %m");
4389 }
4390 }
4391
4392 if (context->ioprio_set)
4393 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4394 *exit_status = EXIT_IOPRIO;
4395 return log_exec_error_errno(context, params, errno, "Failed to set up IO scheduling priority: %m");
4396 }
4397
4398 if (context->timer_slack_nsec != NSEC_INFINITY)
4399 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4400 *exit_status = EXIT_TIMERSLACK;
4401 return log_exec_error_errno(context, params, errno, "Failed to set up timer slack: %m");
4402 }
4403
4404 if (context->personality != PERSONALITY_INVALID) {
4405 r = safe_personality(context->personality);
4406 if (r < 0) {
4407 *exit_status = EXIT_PERSONALITY;
4408 return log_exec_error_errno(context, params, r, "Failed to set up execution domain (personality): %m");
4409 }
4410 }
4411
4412 #if ENABLE_UTMP
4413 if (context->utmp_id) {
4414 _cleanup_free_ char *username_alloc = NULL;
4415
4416 if (!username && context->utmp_mode == EXEC_UTMP_USER) {
4417 username_alloc = uid_to_name(uid_is_valid(uid) ? uid : saved_uid);
4418 if (!username_alloc) {
4419 *exit_status = EXIT_USER;
4420 return log_oom();
4421 }
4422 }
4423
4424 const char *line = context->tty_path ?
4425 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4426 NULL;
4427 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4428 line,
4429 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4430 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4431 USER_PROCESS,
4432 username ?: username_alloc);
4433 }
4434 #endif
4435
4436 if (uid_is_valid(uid)) {
4437 r = chown_terminal(STDIN_FILENO, uid);
4438 if (r < 0) {
4439 *exit_status = EXIT_STDIN;
4440 return log_exec_error_errno(context, params, r, "Failed to change ownership of terminal: %m");
4441 }
4442 }
4443
4444 if (params->cgroup_path) {
4445 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4446 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4447 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4448 * touch a single hierarchy too. */
4449
4450 if (params->flags & EXEC_CGROUP_DELEGATE) {
4451 _cleanup_free_ char *p = NULL;
4452
4453 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4454 if (r < 0) {
4455 *exit_status = EXIT_CGROUP;
4456 return log_exec_error_errno(context, params, r, "Failed to adjust control group access: %m");
4457 }
4458
4459 r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4460 if (r < 0) {
4461 *exit_status = EXIT_CGROUP;
4462 return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4463 }
4464 if (r > 0) {
4465 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
4466 if (r < 0) {
4467 *exit_status = EXIT_CGROUP;
4468 return log_exec_error_errno(context, params, r, "Failed to adjust control subgroup access: %m");
4469 }
4470 }
4471 }
4472
4473 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4474 if (cgroup_context_want_memory_pressure(cgroup_context)) {
4475 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4476 if (r < 0) {
4477 *exit_status = EXIT_MEMORY;
4478 return log_oom();
4479 }
4480
4481 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4482 if (r < 0) {
4483 log_exec_full_errno(context, params, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4484 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4485 memory_pressure_path = mfree(memory_pressure_path);
4486 }
4487 } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4488 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4489 if (!memory_pressure_path) {
4490 *exit_status = EXIT_MEMORY;
4491 return log_oom();
4492 }
4493 }
4494 }
4495 }
4496
4497 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4498
4499 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4500 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4501 if (r < 0)
4502 return log_exec_error_errno(context, params, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4503 }
4504
4505 r = exec_setup_credentials(context, params, params->unit_id, uid, gid);
4506 if (r < 0) {
4507 *exit_status = EXIT_CREDENTIALS;
4508 return log_exec_error_errno(context, params, r, "Failed to set up credentials: %m");
4509 }
4510
4511 r = build_environment(
4512 context,
4513 params,
4514 cgroup_context,
4515 n_fds,
4516 home,
4517 username,
4518 shell,
4519 journal_stream_dev,
4520 journal_stream_ino,
4521 memory_pressure_path,
4522 &our_env);
4523 if (r < 0) {
4524 *exit_status = EXIT_MEMORY;
4525 return log_oom();
4526 }
4527
4528 r = build_pass_environment(context, &pass_env);
4529 if (r < 0) {
4530 *exit_status = EXIT_MEMORY;
4531 return log_oom();
4532 }
4533
4534 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4535 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4536 * not specify PATH but the unit has ExecSearchPath. */
4537 if (!strv_isempty(context->exec_search_path)) {
4538 _cleanup_free_ char *joined = NULL;
4539
4540 joined = strv_join(context->exec_search_path, ":");
4541 if (!joined) {
4542 *exit_status = EXIT_MEMORY;
4543 return log_oom();
4544 }
4545
4546 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4547 if (r < 0) {
4548 *exit_status = EXIT_MEMORY;
4549 return log_oom();
4550 }
4551 }
4552
4553 accum_env = strv_env_merge(params->environment,
4554 our_env,
4555 joined_exec_search_path,
4556 pass_env,
4557 context->environment,
4558 params->files_env);
4559 if (!accum_env) {
4560 *exit_status = EXIT_MEMORY;
4561 return log_oom();
4562 }
4563 accum_env = strv_env_clean(accum_env);
4564
4565 (void) umask(context->umask);
4566
4567 r = setup_keyring(context, params, uid, gid);
4568 if (r < 0) {
4569 *exit_status = EXIT_KEYRING;
4570 return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m");
4571 }
4572
4573 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4574 * from it. */
4575 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4576
4577 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4578 * for it, and the kernel doesn't actually support ambient caps. */
4579 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4580
4581 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4582 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4583 * desired. */
4584 if (needs_ambient_hack)
4585 needs_setuid = false;
4586 else
4587 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4588
4589 uint64_t capability_ambient_set = context->capability_ambient_set;
4590
4591 if (needs_sandboxing) {
4592 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4593 * /sys being present. The actual MAC context application will happen later, as late as
4594 * possible, to avoid impacting our own code paths. */
4595
4596 #if HAVE_SELINUX
4597 use_selinux = mac_selinux_use();
4598 #endif
4599 #if ENABLE_SMACK
4600 use_smack = mac_smack_use();
4601 #endif
4602 #if HAVE_APPARMOR
4603 use_apparmor = mac_apparmor_use();
4604 #endif
4605 }
4606
4607 if (needs_sandboxing) {
4608 int which_failed;
4609
4610 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4611 * is set here. (See below.) */
4612
4613 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4614 if (r < 0) {
4615 *exit_status = EXIT_LIMITS;
4616 return log_exec_error_errno(context, params, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4617 }
4618 }
4619
4620 if (needs_setuid && context->pam_name && username) {
4621 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4622 * wins here. (See above.) */
4623
4624 /* All fds passed in the fds array will be closed in the pam child process. */
4625 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, params->fds, n_fds, params->exec_fd);
4626 if (r < 0) {
4627 *exit_status = EXIT_PAM;
4628 return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
4629 }
4630
4631 if (ambient_capabilities_supported()) {
4632 uint64_t ambient_after_pam;
4633
4634 /* PAM modules might have set some ambient caps. Query them here and merge them into
4635 * the caps we want to set in the end, so that we don't end up unsetting them. */
4636 r = capability_get_ambient(&ambient_after_pam);
4637 if (r < 0) {
4638 *exit_status = EXIT_CAPABILITIES;
4639 return log_exec_error_errno(context, params, r, "Failed to query ambient caps: %m");
4640 }
4641
4642 capability_ambient_set |= ambient_after_pam;
4643 }
4644
4645 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4646 if (ngids_after_pam < 0) {
4647 *exit_status = EXIT_GROUP;
4648 return log_exec_error_errno(context, params, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4649 }
4650 }
4651
4652 if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
4653 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4654 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4655 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4656
4657 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4658 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4659 * the actual requested operations fail (or silently continue). */
4660 if (r < 0 && context->private_users) {
4661 *exit_status = EXIT_USER;
4662 return log_exec_error_errno(context, params, r, "Failed to set up user namespacing for unprivileged user: %m");
4663 }
4664 if (r < 0)
4665 log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4666 else
4667 userns_set_up = true;
4668 }
4669
4670 if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4671
4672 /* Try to enable network namespacing if network namespacing is available and we have
4673 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4674 * new network namespace. And if we don't have that, then we could only create a network
4675 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4676 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
4677 r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
4678 if (ERRNO_IS_NEG_PRIVILEGE(r))
4679 log_exec_notice_errno(context, params, r,
4680 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4681 else if (r < 0) {
4682 *exit_status = EXIT_NETWORK;
4683 return log_exec_error_errno(context, params, r, "Failed to set up network namespacing: %m");
4684 }
4685 } else if (context->network_namespace_path) {
4686 *exit_status = EXIT_NETWORK;
4687 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4688 "NetworkNamespacePath= is not supported, refusing.");
4689 } else
4690 log_exec_notice(context, params, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4691 }
4692
4693 if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4694
4695 if (ns_type_supported(NAMESPACE_IPC)) {
4696 r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4697 if (r == -EPERM)
4698 log_exec_warning_errno(context, params, r,
4699 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4700 else if (r < 0) {
4701 *exit_status = EXIT_NAMESPACE;
4702 return log_exec_error_errno(context, params, r, "Failed to set up IPC namespacing: %m");
4703 }
4704 } else if (context->ipc_namespace_path) {
4705 *exit_status = EXIT_NAMESPACE;
4706 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4707 "IPCNamespacePath= is not supported, refusing.");
4708 } else
4709 log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4710 }
4711
4712 if (needs_mount_namespace) {
4713 _cleanup_free_ char *error_path = NULL;
4714
4715 r = apply_mount_namespace(command->flags,
4716 context,
4717 params,
4718 runtime,
4719 memory_pressure_path,
4720 needs_sandboxing,
4721 &error_path);
4722 if (r < 0) {
4723 *exit_status = EXIT_NAMESPACE;
4724 return log_exec_error_errno(context, params, r, "Failed to set up mount namespacing%s%s: %m",
4725 error_path ? ": " : "", strempty(error_path));
4726 }
4727 }
4728
4729 if (needs_sandboxing) {
4730 r = apply_protect_hostname(context, params, exit_status);
4731 if (r < 0)
4732 return r;
4733 }
4734
4735 if (context->memory_ksm >= 0)
4736 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
4737 if (ERRNO_IS_NOT_SUPPORTED(errno))
4738 log_exec_debug_errno(context,
4739 params,
4740 errno,
4741 "KSM support not available, ignoring.");
4742 else {
4743 *exit_status = EXIT_KSM;
4744 return log_exec_error_errno(context, params, errno, "Failed to set KSM: %m");
4745 }
4746 }
4747
4748 /* Drop groups as early as possible.
4749 * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root.
4750 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4751 if (needs_setuid) {
4752 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4753 int ngids_to_enforce = 0;
4754
4755 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4756 ngids,
4757 gids_after_pam,
4758 ngids_after_pam,
4759 &gids_to_enforce);
4760 if (ngids_to_enforce < 0) {
4761 *exit_status = EXIT_GROUP;
4762 return log_exec_error_errno(context, params,
4763 ngids_to_enforce,
4764 "Failed to merge group lists. Group membership might be incorrect: %m");
4765 }
4766
4767 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4768 if (r < 0) {
4769 *exit_status = EXIT_GROUP;
4770 return log_exec_error_errno(context, params, r, "Changing group credentials failed: %m");
4771 }
4772 }
4773
4774 /* If the user namespace was not set up above, try to do it now.
4775 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4776 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4777 * case of mount namespaces being less privileged when the mount point list is copied from a
4778 * different user namespace). */
4779
4780 if (needs_sandboxing && context->private_users && !userns_set_up) {
4781 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4782 if (r < 0) {
4783 *exit_status = EXIT_USER;
4784 return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m");
4785 }
4786 }
4787
4788 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4789 * shall execute. */
4790
4791 _cleanup_free_ char *executable = NULL;
4792 _cleanup_close_ int executable_fd = -EBADF;
4793 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4794 if (r < 0) {
4795 *exit_status = EXIT_EXEC;
4796 log_exec_struct_errno(context, params, LOG_NOTICE, r,
4797 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4798 LOG_EXEC_MESSAGE(params,
4799 "Unable to locate executable '%s': %m",
4800 command->path),
4801 "EXECUTABLE=%s", command->path);
4802 /* If the error will be ignored by manager, tune down the log level here. Missing executable
4803 * is very much expected in this case. */
4804 return r != -ENOMEM && FLAGS_SET(command->flags, EXEC_COMMAND_IGNORE_FAILURE) ? 1 : r;
4805 }
4806
4807 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &executable_fd);
4808 if (r < 0) {
4809 *exit_status = EXIT_FDS;
4810 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4811 }
4812
4813 #if HAVE_SELINUX
4814 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4815 int fd = -EBADF;
4816
4817 if (socket_fd >= 0)
4818 fd = socket_fd;
4819 else if (params->n_socket_fds == 1)
4820 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4821 * use context from that fd to compute the label. */
4822 fd = params->fds[0];
4823
4824 if (fd >= 0) {
4825 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4826 if (r < 0) {
4827 if (!context->selinux_context_ignore) {
4828 *exit_status = EXIT_SELINUX_CONTEXT;
4829 return log_exec_error_errno(context,
4830 params,
4831 r,
4832 "Failed to determine SELinux context: %m");
4833 }
4834 log_exec_debug_errno(context,
4835 params,
4836 r,
4837 "Failed to determine SELinux context, ignoring: %m");
4838 }
4839 }
4840 }
4841 #endif
4842
4843 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4844 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4845 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
4846 * execve(). But first, close the remaining sockets in the context objects. */
4847
4848 exec_runtime_close(runtime);
4849 exec_params_close(params);
4850
4851 r = close_all_fds(keep_fds, n_keep_fds);
4852 if (r >= 0)
4853 r = pack_fds(params->fds, n_fds);
4854 if (r >= 0)
4855 r = flag_fds(params->fds, n_socket_fds, n_fds, context->non_blocking);
4856 if (r < 0) {
4857 *exit_status = EXIT_FDS;
4858 return log_exec_error_errno(context, params, r, "Failed to adjust passed file descriptors: %m");
4859 }
4860
4861 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4862 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4863 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4864 * came this far. */
4865
4866 secure_bits = context->secure_bits;
4867
4868 if (needs_sandboxing) {
4869 uint64_t bset;
4870
4871 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4872 * (Note this is placed after the general resource limit initialization, see above, in order
4873 * to take precedence.) */
4874 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4875 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4876 *exit_status = EXIT_LIMITS;
4877 return log_exec_error_errno(context, params, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4878 }
4879 }
4880
4881 #if ENABLE_SMACK
4882 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4883 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4884 if (use_smack && context->smack_process_label) {
4885 r = setup_smack(params, context, executable_fd);
4886 if (r < 0 && !context->smack_process_label_ignore) {
4887 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4888 return log_exec_error_errno(context, params, r, "Failed to set SMACK process label: %m");
4889 }
4890 }
4891 #endif
4892
4893 bset = context->capability_bounding_set;
4894 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4895 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4896 * instead of us doing that */
4897 if (needs_ambient_hack)
4898 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4899 (UINT64_C(1) << CAP_SETUID) |
4900 (UINT64_C(1) << CAP_SETGID);
4901
4902 #if HAVE_SECCOMP
4903 /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
4904 * keep the needed privileges to apply it even if we're not root. */
4905 if (needs_setuid &&
4906 uid_is_valid(uid) &&
4907 context_has_seccomp(context) &&
4908 seccomp_allows_drop_privileges(context)) {
4909 keep_seccomp_privileges = true;
4910
4911 if (prctl(PR_SET_KEEPCAPS, 1) < 0) {
4912 *exit_status = EXIT_USER;
4913 return log_exec_error_errno(context, params, errno, "Failed to enable keep capabilities flag: %m");
4914 }
4915
4916 /* Save the current bounding set so we can restore it after applying the seccomp
4917 * filter */
4918 saved_bset = bset;
4919 bset |= (UINT64_C(1) << CAP_SYS_ADMIN) |
4920 (UINT64_C(1) << CAP_SETPCAP);
4921 }
4922 #endif
4923
4924 if (!cap_test_all(bset)) {
4925 r = capability_bounding_set_drop(bset, /* right_now= */ false);
4926 if (r < 0) {
4927 *exit_status = EXIT_CAPABILITIES;
4928 return log_exec_error_errno(context, params, r, "Failed to drop capabilities: %m");
4929 }
4930 }
4931
4932 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4933 * keep-caps set.
4934 *
4935 * To be able to raise the ambient capabilities after setresuid() they have to be added to
4936 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
4937 * the ambient capabilities can be raised as they are present in the permitted and
4938 * inheritable set. However it is possible that someone wants to set ambient capabilities
4939 * without changing the user, so we also set the ambient capabilities here.
4940 *
4941 * The requested ambient capabilities are raised in the inheritable set if the second
4942 * argument is true. */
4943 if (!needs_ambient_hack) {
4944 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
4945 if (r < 0) {
4946 *exit_status = EXIT_CAPABILITIES;
4947 return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (before UID change): %m");
4948 }
4949 }
4950 }
4951
4952 /* chroot to root directory first, before we lose the ability to chroot */
4953 r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
4954 if (r < 0)
4955 return log_exec_error_errno(context, params, r, "Chrooting to the requested root directory failed: %m");
4956
4957 if (needs_setuid) {
4958 if (uid_is_valid(uid)) {
4959 r = enforce_user(context, uid, capability_ambient_set);
4960 if (r < 0) {
4961 *exit_status = EXIT_USER;
4962 return log_exec_error_errno(context, params, r, "Failed to change UID to " UID_FMT ": %m", uid);
4963 }
4964
4965 if (keep_seccomp_privileges) {
4966 if (!FLAGS_SET(capability_ambient_set, (UINT64_C(1) << CAP_SETUID))) {
4967 r = drop_capability(CAP_SETUID);
4968 if (r < 0) {
4969 *exit_status = EXIT_USER;
4970 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETUID: %m");
4971 }
4972 }
4973
4974 r = keep_capability(CAP_SYS_ADMIN);
4975 if (r < 0) {
4976 *exit_status = EXIT_USER;
4977 return log_exec_error_errno(context, params, r, "Failed to keep CAP_SYS_ADMIN: %m");
4978 }
4979
4980 r = keep_capability(CAP_SETPCAP);
4981 if (r < 0) {
4982 *exit_status = EXIT_USER;
4983 return log_exec_error_errno(context, params, r, "Failed to keep CAP_SETPCAP: %m");
4984 }
4985 }
4986
4987 if (!needs_ambient_hack && capability_ambient_set != 0) {
4988
4989 /* Raise the ambient capabilities after user change. */
4990 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
4991 if (r < 0) {
4992 *exit_status = EXIT_CAPABILITIES;
4993 return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (after UID change): %m");
4994 }
4995 }
4996 }
4997 }
4998
4999 /* Apply working directory here, because the working directory might be on NFS and only the user
5000 * running this service might have the correct privilege to change to the working directory. Also, it
5001 * is absolutely 💣 crucial 💣 we applied all mount namespacing rearrangements before this, so that
5002 * the cwd cannot be used to pin directories outside of the sandbox. */
5003 r = apply_working_directory(context, params, runtime, home, exit_status);
5004 if (r < 0)
5005 return log_exec_error_errno(context, params, r, "Changing to the requested working directory failed: %m");
5006
5007 if (needs_sandboxing) {
5008 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5009 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5010 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5011 * are restricted. */
5012
5013 #if HAVE_SELINUX
5014 if (use_selinux) {
5015 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5016
5017 if (exec_context) {
5018 r = setexeccon(exec_context);
5019 if (r < 0) {
5020 if (!context->selinux_context_ignore) {
5021 *exit_status = EXIT_SELINUX_CONTEXT;
5022 return log_exec_error_errno(context, params, r, "Failed to change SELinux context to %s: %m", exec_context);
5023 }
5024 log_exec_debug_errno(context,
5025 params,
5026 r,
5027 "Failed to change SELinux context to %s, ignoring: %m",
5028 exec_context);
5029 }
5030 }
5031 }
5032 #endif
5033
5034 #if HAVE_APPARMOR
5035 if (use_apparmor && context->apparmor_profile) {
5036 r = aa_change_onexec(context->apparmor_profile);
5037 if (r < 0 && !context->apparmor_profile_ignore) {
5038 *exit_status = EXIT_APPARMOR_PROFILE;
5039 return log_exec_error_errno(context,
5040 params,
5041 errno,
5042 "Failed to prepare AppArmor profile change to %s: %m",
5043 context->apparmor_profile);
5044 }
5045 }
5046 #endif
5047
5048 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5049 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5050 * requires CAP_SETPCAP. */
5051 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5052 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5053 * effective set here.
5054 *
5055 * The effective set is overwritten during execve() with the following values:
5056 *
5057 * - ambient set (for non-root processes)
5058 *
5059 * - (inheritable | bounding) set (for root processes)
5060 *
5061 * Hence there is no security impact in raising it in the effective set before execve().
5062 */
5063 r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
5064 if (r < 0) {
5065 *exit_status = EXIT_CAPABILITIES;
5066 return log_exec_error_errno(context, params, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5067 }
5068 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5069 *exit_status = EXIT_SECUREBITS;
5070 return log_exec_error_errno(context, params, errno, "Failed to set process secure bits: %m");
5071 }
5072 }
5073
5074 if (context_has_no_new_privileges(context))
5075 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5076 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5077 return log_exec_error_errno(context, params, errno, "Failed to disable new privileges: %m");
5078 }
5079
5080 #if HAVE_SECCOMP
5081 r = apply_address_families(context, params);
5082 if (r < 0) {
5083 *exit_status = EXIT_ADDRESS_FAMILIES;
5084 return log_exec_error_errno(context, params, r, "Failed to restrict address families: %m");
5085 }
5086
5087 r = apply_memory_deny_write_execute(context, params);
5088 if (r < 0) {
5089 *exit_status = EXIT_SECCOMP;
5090 return log_exec_error_errno(context, params, r, "Failed to disable writing to executable memory: %m");
5091 }
5092
5093 r = apply_restrict_realtime(context, params);
5094 if (r < 0) {
5095 *exit_status = EXIT_SECCOMP;
5096 return log_exec_error_errno(context, params, r, "Failed to apply realtime restrictions: %m");
5097 }
5098
5099 r = apply_restrict_suid_sgid(context, params);
5100 if (r < 0) {
5101 *exit_status = EXIT_SECCOMP;
5102 return log_exec_error_errno(context, params, r, "Failed to apply SUID/SGID restrictions: %m");
5103 }
5104
5105 r = apply_restrict_namespaces(context, params);
5106 if (r < 0) {
5107 *exit_status = EXIT_SECCOMP;
5108 return log_exec_error_errno(context, params, r, "Failed to apply namespace restrictions: %m");
5109 }
5110
5111 r = apply_protect_sysctl(context, params);
5112 if (r < 0) {
5113 *exit_status = EXIT_SECCOMP;
5114 return log_exec_error_errno(context, params, r, "Failed to apply sysctl restrictions: %m");
5115 }
5116
5117 r = apply_protect_kernel_modules(context, params);
5118 if (r < 0) {
5119 *exit_status = EXIT_SECCOMP;
5120 return log_exec_error_errno(context, params, r, "Failed to apply module loading restrictions: %m");
5121 }
5122
5123 r = apply_protect_kernel_logs(context, params);
5124 if (r < 0) {
5125 *exit_status = EXIT_SECCOMP;
5126 return log_exec_error_errno(context, params, r, "Failed to apply kernel log restrictions: %m");
5127 }
5128
5129 r = apply_protect_clock(context, params);
5130 if (r < 0) {
5131 *exit_status = EXIT_SECCOMP;
5132 return log_exec_error_errno(context, params, r, "Failed to apply clock restrictions: %m");
5133 }
5134
5135 r = apply_private_devices(context, params);
5136 if (r < 0) {
5137 *exit_status = EXIT_SECCOMP;
5138 return log_exec_error_errno(context, params, r, "Failed to set up private devices: %m");
5139 }
5140
5141 r = apply_syscall_archs(context, params);
5142 if (r < 0) {
5143 *exit_status = EXIT_SECCOMP;
5144 return log_exec_error_errno(context, params, r, "Failed to apply syscall architecture restrictions: %m");
5145 }
5146
5147 r = apply_lock_personality(context, params);
5148 if (r < 0) {
5149 *exit_status = EXIT_SECCOMP;
5150 return log_exec_error_errno(context, params, r, "Failed to lock personalities: %m");
5151 }
5152
5153 r = apply_syscall_log(context, params);
5154 if (r < 0) {
5155 *exit_status = EXIT_SECCOMP;
5156 return log_exec_error_errno(context, params, r, "Failed to apply system call log filters: %m");
5157 }
5158 #endif
5159
5160 #if HAVE_LIBBPF
5161 r = apply_restrict_filesystems(context, params);
5162 if (r < 0) {
5163 *exit_status = EXIT_BPF;
5164 return log_exec_error_errno(context, params, r, "Failed to restrict filesystems: %m");
5165 }
5166 #endif
5167
5168 #if HAVE_SECCOMP
5169 /* This really should remain as close to the execve() as possible, to make sure our own code is affected
5170 * by the filter as little as possible. */
5171 r = apply_syscall_filter(context, params, needs_ambient_hack);
5172 if (r < 0) {
5173 *exit_status = EXIT_SECCOMP;
5174 return log_exec_error_errno(context, params, r, "Failed to apply system call filters: %m");
5175 }
5176
5177 if (keep_seccomp_privileges) {
5178 /* Restore the capability bounding set with what's expected from the service + the
5179 * ambient capabilities hack */
5180 if (!cap_test_all(saved_bset)) {
5181 r = capability_bounding_set_drop(saved_bset, /* right_now= */ false);
5182 if (r < 0) {
5183 *exit_status = EXIT_CAPABILITIES;
5184 return log_exec_error_errno(context, params, r, "Failed to drop bset capabilities: %m");
5185 }
5186 }
5187
5188 /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break
5189 * applications that use it. */
5190 if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SYS_ADMIN))) {
5191 r = drop_capability(CAP_SYS_ADMIN);
5192 if (r < 0) {
5193 *exit_status = EXIT_USER;
5194 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SYS_ADMIN: %m");
5195 }
5196 }
5197
5198 /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break
5199 * applications that use it. */
5200 if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SETPCAP))) {
5201 r = drop_capability(CAP_SETPCAP);
5202 if (r < 0) {
5203 *exit_status = EXIT_USER;
5204 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETPCAP: %m");
5205 }
5206 }
5207
5208 if (prctl(PR_SET_KEEPCAPS, 0) < 0) {
5209 *exit_status = EXIT_USER;
5210 return log_exec_error_errno(context, params, errno, "Failed to drop keep capabilities flag: %m");
5211 }
5212 }
5213 #endif
5214
5215 }
5216
5217 if (!strv_isempty(context->unset_environment)) {
5218 char **ee = NULL;
5219
5220 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5221 if (!ee) {
5222 *exit_status = EXIT_MEMORY;
5223 return log_oom();
5224 }
5225
5226 strv_free_and_replace(accum_env, ee);
5227 }
5228
5229 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5230 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5231
5232 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5233 if (r < 0) {
5234 *exit_status = EXIT_MEMORY;
5235 return log_exec_error_errno(context,
5236 params,
5237 r,
5238 "Failed to replace environment variables: %m");
5239 }
5240 final_argv = replaced_argv;
5241
5242 if (!strv_isempty(unset_variables)) {
5243 _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5244 log_exec_warning(context,
5245 params,
5246 "Referenced but unset environment variable evaluates to an empty string: %s",
5247 strna(ju));
5248 }
5249
5250 if (!strv_isempty(bad_variables)) {
5251 _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5252 log_exec_warning(context,
5253 params,
5254 "Invalid environment variable name evaluates to an empty string: %s",
5255 strna(jb));
5256 }
5257 } else
5258 final_argv = command->argv;
5259
5260 log_command_line(context, params, "Executing", executable, final_argv);
5261
5262 if (params->exec_fd >= 0) {
5263 uint8_t hot = 1;
5264
5265 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5266 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5267
5268 if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
5269 *exit_status = EXIT_EXEC;
5270 return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m");
5271 }
5272 }
5273
5274 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5275
5276 if (params->exec_fd >= 0) {
5277 uint8_t hot = 0;
5278
5279 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5280 * that POLLHUP on it no longer means execve() succeeded. */
5281
5282 if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
5283 *exit_status = EXIT_EXEC;
5284 return log_exec_error_errno(context, params, errno, "Failed to disable exec_fd: %m");
5285 }
5286 }
5287
5288 *exit_status = EXIT_EXEC;
5289 return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable);
5290 }