]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/exec-invoke.c
logs-show: use journal_add_matchf() and journal_add_match_pair()
[thirdparty/systemd.git] / src / core / exec-invoke.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <sys/eventfd.h>
4 #include <sys/ioctl.h>
5 #include <sys/mount.h>
6 #include <sys/prctl.h>
7
8 #if HAVE_PAM
9 #include <security/pam_appl.h>
10 #include <security/pam_misc.h>
11 #endif
12
13 #if HAVE_APPARMOR
14 #include <sys/apparmor.h>
15 #endif
16
17 #include "sd-messages.h"
18
19 #if HAVE_APPARMOR
20 #include "apparmor-util.h"
21 #endif
22 #include "argv-util.h"
23 #include "barrier.h"
24 #include "bpf-dlopen.h"
25 #include "bpf-restrict-fs.h"
26 #include "btrfs-util.h"
27 #include "capability-util.h"
28 #include "cgroup-setup.h"
29 #include "chase.h"
30 #include "chattr-util.h"
31 #include "chown-recursive.h"
32 #include "copy.h"
33 #include "data-fd-util.h"
34 #include "env-util.h"
35 #include "escape.h"
36 #include "exec-credential.h"
37 #include "exec-invoke.h"
38 #include "execute.h"
39 #include "exit-status.h"
40 #include "fd-util.h"
41 #include "hexdecoct.h"
42 #include "io-util.h"
43 #include "iovec-util.h"
44 #include "journal-send.h"
45 #include "missing_ioprio.h"
46 #include "missing_prctl.h"
47 #include "missing_securebits.h"
48 #include "missing_syscall.h"
49 #include "mkdir-label.h"
50 #include "proc-cmdline.h"
51 #include "process-util.h"
52 #include "psi-util.h"
53 #include "rlimit-util.h"
54 #include "seccomp-util.h"
55 #include "selinux-util.h"
56 #include "signal-util.h"
57 #include "smack-util.h"
58 #include "socket-util.h"
59 #include "string-table.h"
60 #include "strv.h"
61 #include "terminal-util.h"
62 #include "utmp-wtmp.h"
63 #include "vpick.h"
64
65 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
66 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
67
68 #define SNDBUF_SIZE (8*1024*1024)
69
/* Prepares the fds that are handed to the child process.
 *
 * For the first 'n_socket_fds' entries of 'fds' O_NONBLOCK is set or cleared according to
 * 'nonblock'; FD_CLOEXEC is cleared on all 'n_fds' entries. Returns 0 on success, or the first
 * negative error reported by fd_nonblock()/fd_cloexec(). */
static int flag_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        assert(fds || n_fds == 0);

        for (size_t idx = 0; idx < n_fds; idx++) {
                int q;

                /* O_NONBLOCK only matters for socket activation fds, which come first in the array */
                if (idx < n_socket_fds) {
                        q = fd_nonblock(fds[idx], nonblock);
                        if (q < 0)
                                return q;
                }

                /* FD_CLOEXEC is always dropped: these fds are supposed to be passed to our children */
                q = fd_cloexec(fds[idx], false);
                if (q < 0)
                        return q;
        }

        return 0;
}
102
103 static bool is_terminal_input(ExecInput i) {
104 return IN_SET(i,
105 EXEC_INPUT_TTY,
106 EXEC_INPUT_TTY_FORCE,
107 EXEC_INPUT_TTY_FAIL);
108 }
109
110 static bool is_terminal_output(ExecOutput o) {
111 return IN_SET(o,
112 EXEC_OUTPUT_TTY,
113 EXEC_OUTPUT_KMSG_AND_CONSOLE,
114 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
115 }
116
117 static bool is_kmsg_output(ExecOutput o) {
118 return IN_SET(o,
119 EXEC_OUTPUT_KMSG,
120 EXEC_OUTPUT_KMSG_AND_CONSOLE);
121 }
122
123 static bool exec_context_needs_term(const ExecContext *c) {
124 assert(c);
125
126 /* Return true if the execution context suggests we should set $TERM to something useful. */
127
128 if (is_terminal_input(c->std_input))
129 return true;
130
131 if (is_terminal_output(c->std_output))
132 return true;
133
134 if (is_terminal_output(c->std_error))
135 return true;
136
137 return !!c->tty_path;
138 }
139
/* Opens /dev/null with the specified open() flags (plus O_NOCTTY) and moves the resulting fd to
 * 'nfd'. Returns the result of move_fd(), or a negative errno if /dev/null could not be opened. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        return move_fd(null_fd, nfd, false);
}
151
152 static int connect_journal_socket(
153 int fd,
154 const char *log_namespace,
155 uid_t uid,
156 gid_t gid) {
157
158 uid_t olduid = UID_INVALID;
159 gid_t oldgid = GID_INVALID;
160 const char *j;
161 int r;
162
163 assert(fd >= 0);
164
165 j = journal_stream_path(log_namespace);
166 if (!j)
167 return -EINVAL;
168
169 if (gid_is_valid(gid)) {
170 oldgid = getgid();
171
172 if (setegid(gid) < 0)
173 return -errno;
174 }
175
176 if (uid_is_valid(uid)) {
177 olduid = getuid();
178
179 if (seteuid(uid) < 0) {
180 r = -errno;
181 goto restore_gid;
182 }
183 }
184
185 r = connect_unix_path(fd, AT_FDCWD, j);
186
187 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
188 an LSM interferes. */
189
190 if (uid_is_valid(uid))
191 (void) seteuid(olduid);
192
193 restore_gid:
194 if (gid_is_valid(gid))
195 (void) setegid(oldgid);
196
197 return r;
198 }
199
200 static int connect_logger_as(
201 const ExecContext *context,
202 const ExecParameters *params,
203 ExecOutput output,
204 const char *ident,
205 int nfd,
206 uid_t uid,
207 gid_t gid) {
208
209 _cleanup_close_ int fd = -EBADF;
210 int r;
211
212 assert(context);
213 assert(params);
214 assert(output < _EXEC_OUTPUT_MAX);
215 assert(ident);
216 assert(nfd >= 0);
217
218 fd = socket(AF_UNIX, SOCK_STREAM, 0);
219 if (fd < 0)
220 return -errno;
221
222 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
223 if (r < 0)
224 return r;
225
226 if (shutdown(fd, SHUT_RD) < 0)
227 return -errno;
228
229 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
230
231 if (dprintf(fd,
232 "%s\n"
233 "%s\n"
234 "%i\n"
235 "%i\n"
236 "%i\n"
237 "%i\n"
238 "%i\n",
239 context->syslog_identifier ?: ident,
240 params->flags & EXEC_PASS_LOG_UNIT ? params->unit_id : "",
241 context->syslog_priority,
242 !!context->syslog_level_prefix,
243 false,
244 is_kmsg_output(output),
245 is_terminal_output(output)) < 0)
246 return -errno;
247
248 return move_fd(TAKE_FD(fd), nfd, false);
249 }
250
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves the fd to 'nfd'.
 * Returns the result of move_fd(), or the negative error returned by open_terminal(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty;

        assert(path);
        assert(nfd >= 0);

        tty = open_terminal(path, flags | O_NOCTTY);
        if (tty < 0)
                return tty;

        return move_fd(tty, nfd, false);
}
263
264 static int acquire_path(const char *path, int flags, mode_t mode) {
265 _cleanup_close_ int fd = -EBADF;
266 int r;
267
268 assert(path);
269
270 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
271 flags |= O_CREAT;
272
273 fd = open(path, flags|O_NOCTTY, mode);
274 if (fd >= 0)
275 return TAKE_FD(fd);
276
277 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
278 return -errno;
279
280 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
281
282 fd = socket(AF_UNIX, SOCK_STREAM, 0);
283 if (fd < 0)
284 return -errno;
285
286 r = connect_unix_path(fd, AT_FDCWD, path);
287 if (IN_SET(r, -ENOTSOCK, -EINVAL))
288 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
289 * wasn't an AF_UNIX socket after all */
290 return -ENXIO;
291 if (r < 0)
292 return r;
293
294 if ((flags & O_ACCMODE) == O_RDONLY)
295 r = shutdown(fd, SHUT_WR);
296 else if ((flags & O_ACCMODE) == O_WRONLY)
297 r = shutdown(fd, SHUT_RD);
298 else
299 r = 0;
300 if (r < 0)
301 return -errno;
302
303 return TAKE_FD(fd);
304 }
305
306 static int fixup_input(
307 const ExecContext *context,
308 int socket_fd,
309 bool apply_tty_stdin) {
310
311 ExecInput std_input;
312
313 assert(context);
314
315 std_input = context->std_input;
316
317 if (is_terminal_input(std_input) && !apply_tty_stdin)
318 return EXEC_INPUT_NULL;
319
320 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
321 return EXEC_INPUT_NULL;
322
323 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
324 return EXEC_INPUT_NULL;
325
326 return std_input;
327 }
328
329 static int fixup_output(ExecOutput output, int socket_fd) {
330
331 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
332 return EXEC_OUTPUT_INHERIT;
333
334 return output;
335 }
336
337 static int setup_input(
338 const ExecContext *context,
339 const ExecParameters *params,
340 int socket_fd,
341 const int named_iofds[static 3]) {
342
343 ExecInput i;
344 int r;
345
346 assert(context);
347 assert(params);
348 assert(named_iofds);
349
350 if (params->stdin_fd >= 0) {
351 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
352 return -errno;
353
354 /* Try to make this the controlling tty, if it is a tty, and reset it */
355 if (isatty(STDIN_FILENO)) {
356 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
357
358 if (context->tty_reset)
359 (void) reset_terminal_fd(STDIN_FILENO, /* switch_to_text= */ true);
360
361 (void) exec_context_apply_tty_size(context, STDIN_FILENO, /* tty_path= */ NULL);
362 }
363
364 return STDIN_FILENO;
365 }
366
367 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
368
369 switch (i) {
370
371 case EXEC_INPUT_NULL:
372 return open_null_as(O_RDONLY, STDIN_FILENO);
373
374 case EXEC_INPUT_TTY:
375 case EXEC_INPUT_TTY_FORCE:
376 case EXEC_INPUT_TTY_FAIL: {
377 _cleanup_close_ int tty_fd = -EBADF;
378 const char *tty_path;
379
380 tty_path = ASSERT_PTR(exec_context_tty_path(context));
381
382 tty_fd = acquire_terminal(tty_path,
383 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
384 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
385 ACQUIRE_TERMINAL_WAIT,
386 USEC_INFINITY);
387 if (tty_fd < 0)
388 return tty_fd;
389
390 r = exec_context_apply_tty_size(context, tty_fd, tty_path);
391 if (r < 0)
392 return r;
393
394 r = move_fd(tty_fd, STDIN_FILENO, /* cloexec= */ false);
395 if (r < 0)
396 return r;
397
398 TAKE_FD(tty_fd);
399 return r;
400 }
401
402 case EXEC_INPUT_SOCKET:
403 assert(socket_fd >= 0);
404
405 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
406
407 case EXEC_INPUT_NAMED_FD:
408 assert(named_iofds[STDIN_FILENO] >= 0);
409
410 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
411 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
412
413 case EXEC_INPUT_DATA: {
414 int fd;
415
416 fd = acquire_data_fd_full(context->stdin_data, context->stdin_data_size, /* flags = */ 0);
417 if (fd < 0)
418 return fd;
419
420 return move_fd(fd, STDIN_FILENO, false);
421 }
422
423 case EXEC_INPUT_FILE: {
424 bool rw;
425 int fd;
426
427 assert(context->stdio_file[STDIN_FILENO]);
428
429 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
430 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
431
432 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
433 if (fd < 0)
434 return fd;
435
436 return move_fd(fd, STDIN_FILENO, false);
437 }
438
439 default:
440 assert_not_reached();
441 }
442 }
443
444 static bool can_inherit_stderr_from_stdout(
445 const ExecContext *context,
446 ExecOutput o,
447 ExecOutput e) {
448
449 assert(context);
450
451 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
452 * stderr fd */
453
454 if (e == EXEC_OUTPUT_INHERIT)
455 return true;
456 if (e != o)
457 return false;
458
459 if (e == EXEC_OUTPUT_NAMED_FD)
460 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
461
462 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
463 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
464
465 return true;
466 }
467
468 static int setup_output(
469 const ExecContext *context,
470 const ExecParameters *params,
471 int fileno,
472 int socket_fd,
473 const int named_iofds[static 3],
474 const char *ident,
475 uid_t uid,
476 gid_t gid,
477 dev_t *journal_stream_dev,
478 ino_t *journal_stream_ino) {
479
480 ExecOutput o;
481 ExecInput i;
482 int r;
483
484 assert(context);
485 assert(params);
486 assert(ident);
487 assert(journal_stream_dev);
488 assert(journal_stream_ino);
489
490 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
491
492 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
493 return -errno;
494
495 return STDOUT_FILENO;
496 }
497
498 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
499 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
500 return -errno;
501
502 return STDERR_FILENO;
503 }
504
505 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
506 o = fixup_output(context->std_output, socket_fd);
507
508 if (fileno == STDERR_FILENO) {
509 ExecOutput e;
510 e = fixup_output(context->std_error, socket_fd);
511
512 /* This expects the input and output are already set up */
513
514 /* Don't change the stderr file descriptor if we inherit all
515 * the way and are not on a tty */
516 if (e == EXEC_OUTPUT_INHERIT &&
517 o == EXEC_OUTPUT_INHERIT &&
518 i == EXEC_INPUT_NULL &&
519 !is_terminal_input(context->std_input) &&
520 getppid() != 1)
521 return fileno;
522
523 /* Duplicate from stdout if possible */
524 if (can_inherit_stderr_from_stdout(context, o, e))
525 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
526
527 o = e;
528
529 } else if (o == EXEC_OUTPUT_INHERIT) {
530 /* If input got downgraded, inherit the original value */
531 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
532 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
533
534 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
535 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
536 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
537
538 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
539 if (getppid() != 1)
540 return fileno;
541
542 /* We need to open /dev/null here anew, to get the right access mode. */
543 return open_null_as(O_WRONLY, fileno);
544 }
545
546 switch (o) {
547
548 case EXEC_OUTPUT_NULL:
549 return open_null_as(O_WRONLY, fileno);
550
551 case EXEC_OUTPUT_TTY:
552 if (is_terminal_input(i))
553 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
554
555 /* We don't reset the terminal if this is just about output */
556 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
557
558 case EXEC_OUTPUT_KMSG:
559 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
560 case EXEC_OUTPUT_JOURNAL:
561 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
562 r = connect_logger_as(context, params, o, ident, fileno, uid, gid);
563 if (r < 0) {
564 log_exec_warning_errno(context,
565 params,
566 r,
567 "Failed to connect %s to the journal socket, ignoring: %m",
568 fileno == STDOUT_FILENO ? "stdout" : "stderr");
569 r = open_null_as(O_WRONLY, fileno);
570 } else {
571 struct stat st;
572
573 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
574 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
575 * services to detect whether they are connected to the journal or not.
576 *
577 * If both stdout and stderr are connected to a stream then let's make sure to store the data
578 * about STDERR as that's usually the best way to do logging. */
579
580 if (fstat(fileno, &st) >= 0 &&
581 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
582 *journal_stream_dev = st.st_dev;
583 *journal_stream_ino = st.st_ino;
584 }
585 }
586 return r;
587
588 case EXEC_OUTPUT_SOCKET:
589 assert(socket_fd >= 0);
590
591 return RET_NERRNO(dup2(socket_fd, fileno));
592
593 case EXEC_OUTPUT_NAMED_FD:
594 assert(named_iofds[fileno] >= 0);
595
596 (void) fd_nonblock(named_iofds[fileno], false);
597 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
598
599 case EXEC_OUTPUT_FILE:
600 case EXEC_OUTPUT_FILE_APPEND:
601 case EXEC_OUTPUT_FILE_TRUNCATE: {
602 bool rw;
603 int fd, flags;
604
605 assert(context->stdio_file[fileno]);
606
607 rw = context->std_input == EXEC_INPUT_FILE &&
608 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
609
610 if (rw)
611 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
612
613 flags = O_WRONLY;
614 if (o == EXEC_OUTPUT_FILE_APPEND)
615 flags |= O_APPEND;
616 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
617 flags |= O_TRUNC;
618
619 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
620 if (fd < 0)
621 return fd;
622
623 return move_fd(fd, fileno, 0);
624 }
625
626 default:
627 assert_not_reached();
628 }
629 }
630
631 static int chown_terminal(int fd, uid_t uid) {
632 int r;
633
634 assert(fd >= 0);
635
636 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
637 if (!isatty_safe(fd))
638 return 0;
639
640 /* This might fail. What matters are the results. */
641 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
642 if (r < 0)
643 return r;
644
645 return 1;
646 }
647
648 static int setup_confirm_stdio(
649 const ExecContext *context,
650 const char *vc,
651 int *ret_saved_stdin,
652 int *ret_saved_stdout) {
653
654 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
655 int r;
656
657 assert(ret_saved_stdin);
658 assert(ret_saved_stdout);
659
660 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
661 if (saved_stdin < 0)
662 return -errno;
663
664 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
665 if (saved_stdout < 0)
666 return -errno;
667
668 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
669 if (fd < 0)
670 return fd;
671
672 r = chown_terminal(fd, getuid());
673 if (r < 0)
674 return r;
675
676 r = reset_terminal_fd(fd, /* switch_to_text= */ true);
677 if (r < 0)
678 return r;
679
680 r = exec_context_apply_tty_size(context, fd, vc);
681 if (r < 0)
682 return r;
683
684 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
685 TAKE_FD(fd);
686 if (r < 0)
687 return r;
688
689 *ret_saved_stdin = TAKE_FD(saved_stdin);
690 *ret_saved_stdout = TAKE_FD(saved_stdout);
691 return 0;
692 }
693
/* Writes a message to 'fd' explaining why no confirmation could be obtained for unit 'unit_id'.
 * 'err' is the negative errno-style error that occurred; -ETIMEDOUT gets a dedicated message. */
static void write_confirm_error_fd(int err, int fd, const char *unit_id) {
        assert(err < 0);
        assert(unit_id);

        if (err != -ETIMEDOUT) {
                /* Make the error available to %m */
                errno = -err;
                dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", unit_id);
                return;
        }

        dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", unit_id);
}
705
706 static void write_confirm_error(int err, const char *vc, const char *unit_id) {
707 _cleanup_close_ int fd = -EBADF;
708
709 assert(vc);
710
711 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
712 if (fd < 0)
713 return;
714
715 write_confirm_error_fd(err, fd, unit_id);
716 }
717
/* Releases the terminal and puts the saved stdin/stdout fds back in place, where they are set.
 * Both saved fds are closed and invalidated in any case. Returns 0 on success, or the negative
 * errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
739
/* Possible outcomes of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS,      /* = 0: don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE,              /* = 1: execute the command */
};
745
/* Returns true if the flag file /run/systemd/confirm_spawn_disabled exists. */
static bool confirm_spawn_disabled(void) {
        if (access("/run/systemd/confirm_spawn_disabled", F_OK) < 0)
                return false;

        return true;
}
749
750 static int ask_for_confirmation(const ExecContext *context, const ExecParameters *params, const char *cmdline) {
751 int saved_stdout = -1, saved_stdin = -1, r;
752 _cleanup_free_ char *e = NULL;
753 char c;
754
755 assert(context);
756 assert(params);
757
758 /* For any internal errors, assume a positive response. */
759 r = setup_confirm_stdio(context, params->confirm_spawn, &saved_stdin, &saved_stdout);
760 if (r < 0) {
761 write_confirm_error(r, params->confirm_spawn, params->unit_id);
762 return CONFIRM_EXECUTE;
763 }
764
765 /* confirm_spawn might have been disabled while we were sleeping. */
766 if (!params->confirm_spawn || confirm_spawn_disabled()) {
767 r = 1;
768 goto restore_stdio;
769 }
770
771 e = ellipsize(cmdline, 60, 100);
772 if (!e) {
773 log_oom();
774 r = CONFIRM_EXECUTE;
775 goto restore_stdio;
776 }
777
778 for (;;) {
779 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
780 if (r < 0) {
781 write_confirm_error_fd(r, STDOUT_FILENO, params->unit_id);
782 r = CONFIRM_EXECUTE;
783 goto restore_stdio;
784 }
785
786 switch (c) {
787 case 'c':
788 printf("Resuming normal execution.\n");
789 manager_disable_confirm_spawn();
790 r = 1;
791 break;
792 case 'D':
793 printf(" Unit: %s\n",
794 params->unit_id);
795 exec_context_dump(context, stdout, " ");
796 exec_params_dump(params, stdout, " ");
797 continue; /* ask again */
798 case 'f':
799 printf("Failing execution.\n");
800 r = CONFIRM_PRETEND_FAILURE;
801 break;
802 case 'h':
803 printf(" c - continue, proceed without asking anymore\n"
804 " D - dump, show the state of the unit\n"
805 " f - fail, don't execute the command and pretend it failed\n"
806 " h - help\n"
807 " i - info, show a short summary of the unit\n"
808 " j - jobs, show jobs that are in progress\n"
809 " s - skip, don't execute the command and pretend it succeeded\n"
810 " y - yes, execute the command\n");
811 continue; /* ask again */
812 case 'i':
813 printf(" Unit: %s\n"
814 " Command: %s\n",
815 params->unit_id, cmdline);
816 continue; /* ask again */
817 case 'j':
818 if (sigqueue(getppid(),
819 SIGRTMIN+18,
820 (const union sigval) { .sival_int = MANAGER_SIGNAL_COMMAND_DUMP_JOBS }) < 0)
821 return -errno;
822
823 continue; /* ask again */
824 case 'n':
825 /* 'n' was removed in favor of 'f'. */
826 printf("Didn't understand 'n', did you mean 'f'?\n");
827 continue; /* ask again */
828 case 's':
829 printf("Skipping execution.\n");
830 r = CONFIRM_PRETEND_SUCCESS;
831 break;
832 case 'y':
833 r = CONFIRM_EXECUTE;
834 break;
835 default:
836 assert_not_reached();
837 }
838 break;
839 }
840
841 restore_stdio:
842 restore_confirm_stdio(&saved_stdin, &saved_stdout);
843 return r;
844 }
845
846 static int get_fixed_user(
847 const char *user_or_uid,
848 const char **ret_username,
849 uid_t *ret_uid,
850 gid_t *ret_gid,
851 const char **ret_home,
852 const char **ret_shell) {
853
854 int r;
855
856 assert(user_or_uid);
857 assert(ret_username);
858
859 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
860 * (i.e. are "/" or "/bin/nologin"). */
861
862 r = get_user_creds(&user_or_uid, ret_uid, ret_gid, ret_home, ret_shell, USER_CREDS_CLEAN);
863 if (r < 0)
864 return r;
865
866 /* user_or_uid is normalized by get_user_creds to username */
867 *ret_username = user_or_uid;
868
869 return 0;
870 }
871
/* Resolves the group specified as name or numeric GID via get_group_creds(). On success 0 is
 * returned and *ret_groupname points to the group name as normalized by get_group_creds(); on
 * failure the negative error is passed through. */
static int get_fixed_group(
                const char *group_or_gid,
                const char **ret_groupname,
                gid_t *ret_gid) {

        const char *name;
        int q;

        assert(group_or_gid);
        assert(ret_groupname);

        name = group_or_gid;
        q = get_group_creds(&name, ret_gid, /* flags = */ 0);
        if (q < 0)
                return q;

        /* get_group_creds() normalized the name-or-GID string to the group name */
        *ret_groupname = name;

        return 0;
}
891
892 static int get_supplementary_groups(const ExecContext *c, const char *user,
893 const char *group, gid_t gid,
894 gid_t **supplementary_gids, int *ngids) {
895 int r, k = 0;
896 int ngroups_max;
897 bool keep_groups = false;
898 gid_t *groups = NULL;
899 _cleanup_free_ gid_t *l_gids = NULL;
900
901 assert(c);
902
903 /*
904 * If user is given, then lookup GID and supplementary groups list.
905 * We avoid NSS lookups for gid=0. Also we have to initialize groups
906 * here and as early as possible so we keep the list of supplementary
907 * groups of the caller.
908 */
909 if (user && gid_is_valid(gid) && gid != 0) {
910 /* First step, initialize groups from /etc/groups */
911 if (initgroups(user, gid) < 0)
912 return -errno;
913
914 keep_groups = true;
915 }
916
917 if (strv_isempty(c->supplementary_groups))
918 return 0;
919
920 /*
921 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
922 * be positive, otherwise fail.
923 */
924 errno = 0;
925 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
926 if (ngroups_max <= 0)
927 return errno_or_else(EOPNOTSUPP);
928
929 l_gids = new(gid_t, ngroups_max);
930 if (!l_gids)
931 return -ENOMEM;
932
933 if (keep_groups) {
934 /*
935 * Lookup the list of groups that the user belongs to, we
936 * avoid NSS lookups here too for gid=0.
937 */
938 k = ngroups_max;
939 if (getgrouplist(user, gid, l_gids, &k) < 0)
940 return -EINVAL;
941 } else
942 k = 0;
943
944 STRV_FOREACH(i, c->supplementary_groups) {
945 const char *g;
946
947 if (k >= ngroups_max)
948 return -E2BIG;
949
950 g = *i;
951 r = get_group_creds(&g, l_gids+k, 0);
952 if (r < 0)
953 return r;
954
955 k++;
956 }
957
958 /*
959 * Sets ngids to zero to drop all supplementary groups, happens
960 * when we are under root and SupplementaryGroups= is empty.
961 */
962 if (k == 0) {
963 *ngids = 0;
964 return 0;
965 }
966
967 /* Otherwise get the final list of supplementary groups */
968 groups = memdup(l_gids, sizeof(gid_t) * k);
969 if (!groups)
970 return -ENOMEM;
971
972 *supplementary_gids = groups;
973 *ngids = k;
974
975 groups = NULL;
976
977 return 0;
978 }
979
/* Applies the supplementary groups (if there are any) and then the real, effective and saved GID
 * (if 'gid' is valid). Returns 0 on success, a negative errno-style error otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* SupplementaryGroups= is only handled if it is not empty */
        if (ngids > 0) {
                int q;

                q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
998
/* Updates the securebits of the calling process: the bits in 'mask' are cleared and the bits in
 * 'bits' are set. Returns 0 if the securebits already had the requested value and nothing was
 * changed, 1 if they were changed, and a negative errno if prctl() failed. */
static int set_securebits(unsigned bits, unsigned mask) {
        unsigned before, wanted;
        int q;

        q = prctl(PR_GET_SECUREBITS);
        if (q < 0)
                return -errno;

        before = (unsigned) q;

        /* Clear all securebits defined in mask and set bits */
        wanted = (before & ~mask) | bits;
        if (wanted == before)
                return 0;

        return prctl(PR_SET_SECUREBITS, wanted) < 0 ? -errno : 1;
}
1017
1018 static int enforce_user(
1019 const ExecContext *context,
1020 uid_t uid,
1021 uint64_t capability_ambient_set) {
1022 assert(context);
1023 int r;
1024
1025 if (!uid_is_valid(uid))
1026 return 0;
1027
1028 /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1029 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1030 * case. */
1031
1032 if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1033
1034 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1035 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1036 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1037 if (r < 0)
1038 return r;
1039 }
1040
1041 /* Second step: actually set the uids */
1042 if (setresuid(uid, uid, uid) < 0)
1043 return -errno;
1044
1045 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1046 * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1047 * outside of this call. */
1048 return 0;
1049 }
1050
1051 #if HAVE_PAM
1052
1053 static int null_conv(
1054 int num_msg,
1055 const struct pam_message **msg,
1056 struct pam_response **resp,
1057 void *appdata_ptr) {
1058
1059 /* We don't support conversations */
1060
1061 return PAM_CONV_ERR;
1062 }
1063
1064 static int pam_close_session_and_delete_credentials(pam_handle_t *handle, int flags) {
1065 int r, s;
1066
1067 assert(handle);
1068
1069 r = pam_close_session(handle, flags);
1070 if (r != PAM_SUCCESS)
1071 log_debug("pam_close_session() failed: %s", pam_strerror(handle, r));
1072
1073 s = pam_setcred(handle, PAM_DELETE_CRED | flags);
1074 if (s != PAM_SUCCESS)
1075 log_debug("pam_setcred(PAM_DELETE_CRED) failed: %s", pam_strerror(handle, s));
1076
1077 return r != PAM_SUCCESS ? r : s;
1078 }
1079
1080 #endif
1081
1082 static int setup_pam(
1083 const char *name,
1084 const char *user,
1085 uid_t uid,
1086 gid_t gid,
1087 const char *tty,
1088 char ***env, /* updated on success */
1089 const int fds[], size_t n_fds,
1090 int exec_fd) {
1091
1092 #if HAVE_PAM
1093
1094 static const struct pam_conv conv = {
1095 .conv = null_conv,
1096 .appdata_ptr = NULL
1097 };
1098
1099 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1100 _cleanup_strv_free_ char **e = NULL;
1101 pam_handle_t *handle = NULL;
1102 sigset_t old_ss;
1103 int pam_code = PAM_SUCCESS, r;
1104 bool close_session = false;
1105 pid_t parent_pid;
1106 int flags = 0;
1107
1108 assert(name);
1109 assert(user);
1110 assert(env);
1111
1112 /* We set up PAM in the parent process, then fork. The child
1113 * will then stay around until killed via PR_GET_PDEATHSIG or
1114 * systemd via the cgroup logic. It will then remove the PAM
1115 * session again. The parent process will exec() the actual
1116 * daemon. We do things this way to ensure that the main PID
1117 * of the daemon is the one we initially fork()ed. */
1118
1119 r = barrier_create(&barrier);
1120 if (r < 0)
1121 goto fail;
1122
1123 if (log_get_max_level() < LOG_DEBUG)
1124 flags |= PAM_SILENT;
1125
1126 pam_code = pam_start(name, user, &conv, &handle);
1127 if (pam_code != PAM_SUCCESS) {
1128 handle = NULL;
1129 goto fail;
1130 }
1131
1132 if (!tty) {
1133 _cleanup_free_ char *q = NULL;
1134
1135 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1136 * out if that's the case, and read the TTY off it. */
1137
1138 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1139 tty = strjoina("/dev/", q);
1140 }
1141
1142 if (tty) {
1143 pam_code = pam_set_item(handle, PAM_TTY, tty);
1144 if (pam_code != PAM_SUCCESS)
1145 goto fail;
1146 }
1147
1148 STRV_FOREACH(nv, *env) {
1149 pam_code = pam_putenv(handle, *nv);
1150 if (pam_code != PAM_SUCCESS)
1151 goto fail;
1152 }
1153
1154 pam_code = pam_acct_mgmt(handle, flags);
1155 if (pam_code != PAM_SUCCESS)
1156 goto fail;
1157
1158 pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
1159 if (pam_code != PAM_SUCCESS)
1160 log_debug("pam_setcred(PAM_ESTABLISH_CRED) failed, ignoring: %s", pam_strerror(handle, pam_code));
1161
1162 pam_code = pam_open_session(handle, flags);
1163 if (pam_code != PAM_SUCCESS)
1164 goto fail;
1165
1166 close_session = true;
1167
1168 e = pam_getenvlist(handle);
1169 if (!e) {
1170 pam_code = PAM_BUF_ERR;
1171 goto fail;
1172 }
1173
1174 /* Block SIGTERM, so that we know that it won't get lost in the child */
1175
1176 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM) >= 0);
1177
1178 parent_pid = getpid_cached();
1179
1180 r = safe_fork("(sd-pam)", 0, NULL);
1181 if (r < 0)
1182 goto fail;
1183 if (r == 0) {
1184 int ret = EXIT_PAM;
1185
1186 /* The child's job is to reset the PAM session on termination */
1187 barrier_set_role(&barrier, BARRIER_CHILD);
1188
1189 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1190 * those fds are open here that have been opened by PAM. */
1191 (void) close_many(fds, n_fds);
1192
1193 /* Also close the 'exec_fd' in the child, since the service manager waits for the EOF induced
1194 * by the execve() to wait for completion, and if we'd keep the fd open here in the child
1195 * we'd never signal completion. */
1196 exec_fd = safe_close(exec_fd);
1197
1198 /* Drop privileges - we don't need any to pam_close_session and this will make
1199 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
1200 * threads to fail to exit normally */
1201
1202 r = fully_set_uid_gid(uid, gid, /* supplementary_gids= */ NULL, /* n_supplementary_gids= */ 0);
1203 if (r < 0)
1204 log_warning_errno(r, "Failed to drop privileges in sd-pam: %m");
1205
1206 (void) ignore_signals(SIGPIPE);
1207
1208 /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1209 * otherwise the kernel will not allow unprivileged parents kill their privileged children
1210 * this way. We rely on the control groups kill logic to do the rest for us. */
1211 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1212 goto child_finish;
1213
1214 /* Tell the parent that our setup is done. This is especially important regarding dropping
1215 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1216 *
1217 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1218 (void) barrier_place(&barrier);
1219
1220 /* Check if our parent process might already have died? */
1221 if (getppid() == parent_pid) {
1222 sigset_t ss;
1223 int sig;
1224
1225 assert_se(sigemptyset(&ss) >= 0);
1226 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1227
1228 assert_se(sigwait(&ss, &sig) == 0);
1229 assert(sig == SIGTERM);
1230 }
1231
1232 /* If our parent died we'll end the session */
1233 if (getppid() != parent_pid) {
1234 pam_code = pam_close_session_and_delete_credentials(handle, flags);
1235 if (pam_code != PAM_SUCCESS)
1236 goto child_finish;
1237 }
1238
1239 ret = 0;
1240
1241 child_finish:
1242 /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1243 * know about this. See pam_end(3) */
1244 (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
1245 _exit(ret);
1246 }
1247
1248 barrier_set_role(&barrier, BARRIER_PARENT);
1249
1250 /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1251 * here. */
1252 handle = NULL;
1253
1254 /* Unblock SIGTERM again in the parent */
1255 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1256
1257 /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1258 * this fd around. */
1259 closelog();
1260
1261 /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1262 * recover. However, warn loudly if it happens. */
1263 if (!barrier_place_and_sync(&barrier))
1264 log_error("PAM initialization failed");
1265
1266 return strv_free_and_replace(*env, e);
1267
1268 fail:
1269 if (pam_code != PAM_SUCCESS) {
1270 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1271 r = -EPERM; /* PAM errors do not map to errno */
1272 } else
1273 log_error_errno(r, "PAM failed: %m");
1274
1275 if (handle) {
1276 if (close_session)
1277 pam_code = pam_close_session_and_delete_credentials(handle, flags);
1278
1279 (void) pam_end(handle, pam_code | flags);
1280 }
1281
1282 closelog();
1283 return r;
1284 #else
1285 return 0;
1286 #endif
1287 }
1288
1289 static void rename_process_from_path(const char *path) {
1290 _cleanup_free_ char *buf = NULL;
1291 const char *p;
1292
1293 assert(path);
1294
1295 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1296 * /bin/ps */
1297
1298 if (path_extract_filename(path, &buf) < 0) {
1299 rename_process("(...)");
1300 return;
1301 }
1302
1303 size_t l = strlen(buf);
1304 if (l > 8) {
1305 /* The end of the process name is usually more interesting, since the first bit might just be
1306 * "systemd-" */
1307 p = buf + l - 8;
1308 l = 8;
1309 } else
1310 p = buf;
1311
1312 char process_name[11];
1313 process_name[0] = '(';
1314 memcpy(process_name+1, p, l);
1315 process_name[1+l] = ')';
1316 process_name[1+l+1] = 0;
1317
1318 (void) rename_process(process_name);
1319 }
1320
1321 static bool context_has_address_families(const ExecContext *c) {
1322 assert(c);
1323
1324 return c->address_families_allow_list ||
1325 !set_isempty(c->address_families);
1326 }
1327
1328 static bool context_has_syscall_filters(const ExecContext *c) {
1329 assert(c);
1330
1331 return c->syscall_allow_list ||
1332 !hashmap_isempty(c->syscall_filter);
1333 }
1334
1335 static bool context_has_syscall_logs(const ExecContext *c) {
1336 assert(c);
1337
1338 return c->syscall_log_allow_list ||
1339 !hashmap_isempty(c->syscall_log);
1340 }
1341
1342 static bool context_has_seccomp(const ExecContext *c) {
1343 /* We need NNP if we have any form of seccomp and are unprivileged */
1344 return c->lock_personality ||
1345 c->memory_deny_write_execute ||
1346 c->private_devices ||
1347 c->protect_clock ||
1348 c->protect_hostname ||
1349 c->protect_kernel_tunables ||
1350 c->protect_kernel_modules ||
1351 c->protect_kernel_logs ||
1352 context_has_address_families(c) ||
1353 exec_context_restrict_namespaces_set(c) ||
1354 c->restrict_realtime ||
1355 c->restrict_suid_sgid ||
1356 !set_isempty(c->syscall_archs) ||
1357 context_has_syscall_filters(c) ||
1358 context_has_syscall_logs(c);
1359 }
1360
1361 static bool context_has_no_new_privileges(const ExecContext *c) {
1362 assert(c);
1363
1364 if (c->no_new_privileges)
1365 return true;
1366
1367 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1368 return false;
1369
1370 return context_has_seccomp(c);
1371 }
1372
1373 #if HAVE_SECCOMP
1374
1375 static bool seccomp_allows_drop_privileges(const ExecContext *c) {
1376 void *id, *val;
1377 bool has_capget = false, has_capset = false, has_prctl = false;
1378
1379 assert(c);
1380
1381 /* No syscall filter, we are allowed to drop privileges */
1382 if (hashmap_isempty(c->syscall_filter))
1383 return true;
1384
1385 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
1386 _cleanup_free_ char *name = NULL;
1387
1388 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
1389
1390 if (streq(name, "capget"))
1391 has_capget = true;
1392 else if (streq(name, "capset"))
1393 has_capset = true;
1394 else if (streq(name, "prctl"))
1395 has_prctl = true;
1396 }
1397
1398 if (c->syscall_allow_list)
1399 return has_capget && has_capset && has_prctl;
1400 else
1401 return !(has_capget || has_capset || has_prctl);
1402 }
1403
1404 static bool skip_seccomp_unavailable(const ExecContext *c, const ExecParameters *p, const char* msg) {
1405
1406 if (is_seccomp_available())
1407 return false;
1408
1409 log_exec_debug(c, p, "SECCOMP features not detected in the kernel, skipping %s", msg);
1410 return true;
1411 }
1412
1413 static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p, bool needs_ambient_hack) {
1414 uint32_t negative_action, default_action, action;
1415 int r;
1416
1417 assert(c);
1418 assert(p);
1419
1420 if (!context_has_syscall_filters(c))
1421 return 0;
1422
1423 if (skip_seccomp_unavailable(c, p, "SystemCallFilter="))
1424 return 0;
1425
1426 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1427
1428 if (c->syscall_allow_list) {
1429 default_action = negative_action;
1430 action = SCMP_ACT_ALLOW;
1431 } else {
1432 default_action = SCMP_ACT_ALLOW;
1433 action = negative_action;
1434 }
1435
1436 if (needs_ambient_hack) {
1437 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1438 if (r < 0)
1439 return r;
1440 }
1441
1442 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1443 }
1444
1445 static int apply_syscall_log(const ExecContext *c, const ExecParameters *p) {
1446 #ifdef SCMP_ACT_LOG
1447 uint32_t default_action, action;
1448 #endif
1449
1450 assert(c);
1451 assert(p);
1452
1453 if (!context_has_syscall_logs(c))
1454 return 0;
1455
1456 #ifdef SCMP_ACT_LOG
1457 if (skip_seccomp_unavailable(c, p, "SystemCallLog="))
1458 return 0;
1459
1460 if (c->syscall_log_allow_list) {
1461 /* Log nothing but the ones listed */
1462 default_action = SCMP_ACT_ALLOW;
1463 action = SCMP_ACT_LOG;
1464 } else {
1465 /* Log everything but the ones listed */
1466 default_action = SCMP_ACT_LOG;
1467 action = SCMP_ACT_ALLOW;
1468 }
1469
1470 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1471 #else
1472 /* old libseccomp */
1473 log_exec_debug(c, p, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1474 return 0;
1475 #endif
1476 }
1477
1478 static int apply_syscall_archs(const ExecContext *c, const ExecParameters *p) {
1479 assert(c);
1480 assert(p);
1481
1482 if (set_isempty(c->syscall_archs))
1483 return 0;
1484
1485 if (skip_seccomp_unavailable(c, p, "SystemCallArchitectures="))
1486 return 0;
1487
1488 return seccomp_restrict_archs(c->syscall_archs);
1489 }
1490
1491 static int apply_address_families(const ExecContext *c, const ExecParameters *p) {
1492 assert(c);
1493 assert(p);
1494
1495 if (!context_has_address_families(c))
1496 return 0;
1497
1498 if (skip_seccomp_unavailable(c, p, "RestrictAddressFamilies="))
1499 return 0;
1500
1501 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1502 }
1503
1504 static int apply_memory_deny_write_execute(const ExecContext *c, const ExecParameters *p) {
1505 int r;
1506
1507 assert(c);
1508 assert(p);
1509
1510 if (!c->memory_deny_write_execute)
1511 return 0;
1512
1513 /* use prctl() if kernel supports it (6.3) */
1514 r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1515 if (r == 0) {
1516 log_exec_debug(c, p, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1517 return 0;
1518 }
1519 if (r < 0 && errno != EINVAL)
1520 return log_exec_debug_errno(c,
1521 p,
1522 errno,
1523 "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1524 /* else use seccomp */
1525 log_exec_debug(c, p, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1526
1527 if (skip_seccomp_unavailable(c, p, "MemoryDenyWriteExecute="))
1528 return 0;
1529
1530 return seccomp_memory_deny_write_execute();
1531 }
1532
1533 static int apply_restrict_realtime(const ExecContext *c, const ExecParameters *p) {
1534 assert(c);
1535 assert(p);
1536
1537 if (!c->restrict_realtime)
1538 return 0;
1539
1540 if (skip_seccomp_unavailable(c, p, "RestrictRealtime="))
1541 return 0;
1542
1543 return seccomp_restrict_realtime();
1544 }
1545
1546 static int apply_restrict_suid_sgid(const ExecContext *c, const ExecParameters *p) {
1547 assert(c);
1548 assert(p);
1549
1550 if (!c->restrict_suid_sgid)
1551 return 0;
1552
1553 if (skip_seccomp_unavailable(c, p, "RestrictSUIDSGID="))
1554 return 0;
1555
1556 return seccomp_restrict_suid_sgid();
1557 }
1558
1559 static int apply_protect_sysctl(const ExecContext *c, const ExecParameters *p) {
1560 assert(c);
1561 assert(p);
1562
1563 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1564 * let's protect even those systems where this is left on in the kernel. */
1565
1566 if (!c->protect_kernel_tunables)
1567 return 0;
1568
1569 if (skip_seccomp_unavailable(c, p, "ProtectKernelTunables="))
1570 return 0;
1571
1572 return seccomp_protect_sysctl();
1573 }
1574
1575 static int apply_protect_kernel_modules(const ExecContext *c, const ExecParameters *p) {
1576 assert(c);
1577 assert(p);
1578
1579 /* Turn off module syscalls on ProtectKernelModules=yes */
1580
1581 if (!c->protect_kernel_modules)
1582 return 0;
1583
1584 if (skip_seccomp_unavailable(c, p, "ProtectKernelModules="))
1585 return 0;
1586
1587 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1588 }
1589
1590 static int apply_protect_kernel_logs(const ExecContext *c, const ExecParameters *p) {
1591 assert(c);
1592 assert(p);
1593
1594 if (!c->protect_kernel_logs)
1595 return 0;
1596
1597 if (skip_seccomp_unavailable(c, p, "ProtectKernelLogs="))
1598 return 0;
1599
1600 return seccomp_protect_syslog();
1601 }
1602
1603 static int apply_protect_clock(const ExecContext *c, const ExecParameters *p) {
1604 assert(c);
1605 assert(p);
1606
1607 if (!c->protect_clock)
1608 return 0;
1609
1610 if (skip_seccomp_unavailable(c, p, "ProtectClock="))
1611 return 0;
1612
1613 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1614 }
1615
1616 static int apply_private_devices(const ExecContext *c, const ExecParameters *p) {
1617 assert(c);
1618 assert(p);
1619
1620 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1621
1622 if (!c->private_devices)
1623 return 0;
1624
1625 if (skip_seccomp_unavailable(c, p, "PrivateDevices="))
1626 return 0;
1627
1628 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1629 }
1630
1631 static int apply_restrict_namespaces(const ExecContext *c, const ExecParameters *p) {
1632 assert(c);
1633 assert(p);
1634
1635 if (!exec_context_restrict_namespaces_set(c))
1636 return 0;
1637
1638 if (skip_seccomp_unavailable(c, p, "RestrictNamespaces="))
1639 return 0;
1640
1641 return seccomp_restrict_namespaces(c->restrict_namespaces);
1642 }
1643
1644 static int apply_lock_personality(const ExecContext *c, const ExecParameters *p) {
1645 unsigned long personality;
1646 int r;
1647
1648 assert(c);
1649 assert(p);
1650
1651 if (!c->lock_personality)
1652 return 0;
1653
1654 if (skip_seccomp_unavailable(c, p, "LockPersonality="))
1655 return 0;
1656
1657 personality = c->personality;
1658
1659 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1660 if (personality == PERSONALITY_INVALID) {
1661
1662 r = opinionated_personality(&personality);
1663 if (r < 0)
1664 return r;
1665 }
1666
1667 return seccomp_lock_personality(personality);
1668 }
1669
1670 #endif
1671
1672 #if HAVE_LIBBPF
1673 static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) {
1674 int r;
1675
1676 assert(c);
1677 assert(p);
1678
1679 if (!exec_context_restrict_filesystems_set(c))
1680 return 0;
1681
1682 if (p->bpf_restrict_fs_map_fd < 0) {
1683 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1684 log_exec_debug(c, p, "LSM BPF not supported, skipping RestrictFileSystems=");
1685 return 0;
1686 }
1687
1688 /* We are in a new binary, so dl-open again */
1689 r = dlopen_bpf();
1690 if (r < 0)
1691 return r;
1692
1693 return bpf_restrict_fs_update(c->restrict_filesystems, p->cgroup_id, p->bpf_restrict_fs_map_fd, c->restrict_filesystems_allow_list);
1694 }
1695 #endif
1696
1697 static int apply_protect_hostname(const ExecContext *c, const ExecParameters *p, int *ret_exit_status) {
1698 assert(c);
1699 assert(p);
1700
1701 if (!c->protect_hostname)
1702 return 0;
1703
1704 if (ns_type_supported(NAMESPACE_UTS)) {
1705 if (unshare(CLONE_NEWUTS) < 0) {
1706 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1707 *ret_exit_status = EXIT_NAMESPACE;
1708 return log_exec_error_errno(c,
1709 p,
1710 errno,
1711 "Failed to set up UTS namespacing: %m");
1712 }
1713
1714 log_exec_warning(c,
1715 p,
1716 "ProtectHostname=yes is configured, but UTS namespace setup is "
1717 "prohibited (container manager?), ignoring namespace setup.");
1718 }
1719 } else
1720 log_exec_warning(c,
1721 p,
1722 "ProtectHostname=yes is configured, but the kernel does not "
1723 "support UTS namespaces, ignoring namespace setup.");
1724
1725 #if HAVE_SECCOMP
1726 int r;
1727
1728 if (skip_seccomp_unavailable(c, p, "ProtectHostname="))
1729 return 0;
1730
1731 r = seccomp_protect_hostname();
1732 if (r < 0) {
1733 *ret_exit_status = EXIT_SECCOMP;
1734 return log_exec_error_errno(c, p, r, "Failed to apply hostname restrictions: %m");
1735 }
1736 #endif
1737
1738 return 0;
1739 }
1740
1741 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1742 assert(idle_pipe);
1743
1744 idle_pipe[1] = safe_close(idle_pipe[1]);
1745 idle_pipe[2] = safe_close(idle_pipe[2]);
1746
1747 if (idle_pipe[0] >= 0) {
1748 int r;
1749
1750 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1751
1752 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1753 ssize_t n;
1754
1755 /* Signal systemd that we are bored and want to continue. */
1756 n = write(idle_pipe[3], "x", 1);
1757 if (n > 0)
1758 /* Wait for systemd to react to the signal above. */
1759 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1760 }
1761
1762 idle_pipe[0] = safe_close(idle_pipe[0]);
1763
1764 }
1765
1766 idle_pipe[3] = safe_close(idle_pipe[3]);
1767 }
1768
/* Forward declaration of the lookup function; presumably defined by the
 * DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING() invocation below. */
static const char *exec_directory_env_name_to_string(ExecDirectoryType t);

/* This table maps each ExecDirectoryType to the name of the environment variable in which we pass the
 * selected directory to the service payload. */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "RUNTIME_DIRECTORY",
        [EXEC_DIRECTORY_STATE]         = "STATE_DIRECTORY",
        [EXEC_DIRECTORY_CACHE]         = "CACHE_DIRECTORY",
        [EXEC_DIRECTORY_LOGS]          = "LOGS_DIRECTORY",
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
1782
/* Assembles the environment variables the manager itself sets for the service payload: socket activation
 * ($LISTEN_*), watchdog ($WATCHDOG_*), user identity ($USER, $LOGNAME, $HOME, $SHELL), $INVOCATION_ID,
 * $TERM, $JOURNAL_STREAM, $LOG_NAMESPACE, the per-type directory variables, $CREDENTIALS_DIRECTORY,
 * $SYSTEMD_EXEC_PID and the memory pressure variables.
 *
 * On success stores a newly allocated, NULL-terminated string array in *ret and returns 0. On failure
 * returns a negative errno-style error and leaves *ret untouched. */
static int build_environment(
                const ExecContext *c,
                const ExecParameters *p,
                const CGroupContext *cgroup_context,
                size_t n_fds,
                const char *home,
                const char *username,
                const char *shell,
                dev_t journal_stream_dev,
                ino_t journal_stream_ino,
                const char *memory_pressure_path,
                char ***ret) {

        _cleanup_strv_free_ char **our_env = NULL;
        size_t n_env = 0;
        char *x;
        int r;

        assert(c);
        assert(p);
        assert(ret);

        /* The array is allocated once with a fixed size: up to 18 individual variables are added below,
         * plus one slot for the terminating NULL, plus one slot per directory type. Keep N_ENV_VARS in
         * sync when adding variables; the assert() at the end checks the bound. */
#define N_ENV_VARS 19
        our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
        if (!our_env)
                return -ENOMEM;

        /* Socket activation: only set if fds are passed */
        if (n_fds > 0) {
                _cleanup_free_ char *joined = NULL;

                if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                joined = strv_join(p->fd_names, ":");
                if (!joined)
                        return -ENOMEM;

                x = strjoin("LISTEN_FDNAMES=", joined);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* Watchdog: only if requested via the flags and a non-zero timeout is configured */
        if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
                if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
         * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
         * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
        if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
                x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
         * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
         * really make much sense since we're not logged in. Hence we conditionalize the three based on
         * SetLoginEnvironment= switch. */
        if (!c->user && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
                /* Note that this overrides the username/home/shell values passed in by the caller */
                r = get_fixed_user("root", &username, NULL, NULL, &home, &shell);
                if (r < 0)
                        return log_exec_debug_errno(c,
                                                    p,
                                                    r,
                                                    "Failed to determine user credentials for root: %m");
        }

        bool set_user_login_env = exec_context_get_set_login_environment(c);

        if (username) {
                x = strjoin("USER=", username);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (set_user_login_env) {
                        x = strjoin("LOGNAME=", username);
                        if (!x)
                                return -ENOMEM;
                        our_env[n_env++] = x;
                }
        }

        if (home && set_user_login_env) {
                x = strjoin("HOME=", home);
                if (!x)
                        return -ENOMEM;

                /* Simplify only the value, i.e. skip the 5 characters of "HOME=" */
                path_simplify(x + 5);
                our_env[n_env++] = x;
        }

        if (shell && set_user_login_env) {
                x = strjoin("SHELL=", shell);
                if (!x)
                        return -ENOMEM;

                /* Simplify only the value, i.e. skip the 6 characters of "SHELL=" */
                path_simplify(x + 6);
                our_env[n_env++] = x;
        }

        if (!sd_id128_is_null(p->invocation_id)) {
                assert(p->invocation_id_string);

                x = strjoin("INVOCATION_ID=", p->invocation_id_string);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (exec_context_needs_term(c)) {
                _cleanup_free_ char *cmdline = NULL;
                const char *tty_path, *term = NULL;

                tty_path = exec_context_tty_path(c);

                /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
                 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
                 * container manager passes to PID 1 ends up all the way in the console login shown. */

                if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
                        term = getenv("TERM");
                else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
                        _cleanup_free_ char *key = NULL;

                        /* Otherwise look for a per-TTY "systemd.tty.term.<tty>" key on the kernel
                         * command line */
                        key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
                        if (!key)
                                return -ENOMEM;

                        r = proc_cmdline_get_key(key, 0, &cmdline);
                        if (r < 0)
                                log_exec_debug_errno(c,
                                                     p,
                                                     r,
                                                     "Failed to read %s from kernel cmdline, ignoring: %m",
                                                     key);
                        else if (r > 0)
                                term = cmdline;
                }

                /* Nothing found so far: fall back to the default for this TTY */
                if (!term)
                        term = default_term_for_tty(tty_path);

                x = strjoin("TERM=", term);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* Only set if both device and inode number are non-zero */
        if (journal_stream_dev != 0 && journal_stream_ino != 0) {
                if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (c->log_namespace) {
                x = strjoin("LOG_NAMESPACE=", c->log_namespace);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* One variable per directory type that has a prefix and at least one configured item; the value
         * is the ":"-separated list of the prefixed paths. */
        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                _cleanup_free_ char *joined = NULL;
                const char *n;

                if (!p->prefix[t])
                        continue;

                if (c->directories[t].n_items == 0)
                        continue;

                n = exec_directory_env_name_to_string(t);
                if (!n)
                        continue;

                for (size_t i = 0; i < c->directories[t].n_items; i++) {
                        _cleanup_free_ char *prefixed = NULL;

                        prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
                        if (!prefixed)
                                return -ENOMEM;

                        if (!strextend_with_separator(&joined, ":", prefixed))
                                return -ENOMEM;
                }

                x = strjoin(n, "=", joined);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        _cleanup_free_ char *creds_dir = NULL;
        r = exec_context_get_credential_directory(c, p, p->unit_id, &creds_dir);
        if (r < 0)
                return r;
        if (r > 0) { /* only a positive return adds the variable */
                x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* Set unconditionally */
        if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
                return -ENOMEM;

        our_env[n_env++] = x;

        if (memory_pressure_path) {
                x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;

                /* $MEMORY_PRESSURE_WRITE is only added if we have a cgroup context and the watch path is
                 * not /dev/null */
                if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
                        _cleanup_free_ char *b = NULL, *e = NULL;

                        /* "<type> <threshold> <window>": an infinite threshold is replaced by the default,
                         * anything else is clamped to the range 1…default window */
                        if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
                                     MEMORY_PRESSURE_DEFAULT_TYPE,
                                     cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
                                     CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
                                     MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
                                return -ENOMEM;

                        /* The string is Base64-encoded including its trailing NUL byte (hence the + 1) */
                        if (base64mem(b, strlen(b) + 1, &e) < 0)
                                return -ENOMEM;

                        x = strjoin("MEMORY_PRESSURE_WRITE=", e);
                        if (!x)
                                return -ENOMEM;

                        our_env[n_env++] = x;
                }
        }

        /* Strictly smaller: at least one zero-initialized slot must remain as NULL terminator */
        assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
#undef N_ENV_VARS

        *ret = TAKE_PTR(our_env);

        return 0;
}
2046
2047 static int build_pass_environment(const ExecContext *c, char ***ret) {
2048 _cleanup_strv_free_ char **pass_env = NULL;
2049 size_t n_env = 0;
2050
2051 STRV_FOREACH(i, c->pass_environment) {
2052 _cleanup_free_ char *x = NULL;
2053 char *v;
2054
2055 v = getenv(*i);
2056 if (!v)
2057 continue;
2058 x = strjoin(*i, "=", v);
2059 if (!x)
2060 return -ENOMEM;
2061
2062 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2063 return -ENOMEM;
2064
2065 pass_env[n_env++] = TAKE_PTR(x);
2066 pass_env[n_env] = NULL;
2067 }
2068
2069 *ret = TAKE_PTR(pass_env);
2070
2071 return 0;
2072 }
2073
/* Moves the calling process into a new user namespace and has a forked helper write its UID/GID maps.
 *
 * 'ouid'/'ogid' are the original IDs, 'uid'/'gid' the selected ones. Returns 0 on success, or a negative
 * errno-style error: from the local calls here, as reported by the helper through the error pipe, or
 * -EIO if the helper behaved unexpectedly. */
static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
        _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
        _cleanup_close_ int unshare_ready_fd = -EBADF;
        _cleanup_(sigkill_waitp) pid_t pid = 0; /* presumably kills and reaps the helper on early return */
        uint64_t c = 1;
        ssize_t n;
        int r;

        /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
         * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
         * which waits for the parent to create the new user namespace while staying in the original namespace. The
         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
         * continues execution normally.
         * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
         * does not need CAP_SETUID to write the single line mapping to itself. */

        /* Can only set up multiple mappings with CAP_SETUID. */
        if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
                             ouid, ouid, uid, uid);
        else
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
                             ouid, ouid);

        if (r < 0)
                return -ENOMEM;

        /* Can only set up multiple mappings with CAP_SETGID. */
        if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
                             ogid, ogid, gid, gid);
        else
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
                             ogid, ogid);

        if (r < 0)
                return -ENOMEM;

        /* Create a communication channel so that the parent can tell the child when it finished creating the user
         * namespace. */
        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
        if (unshare_ready_fd < 0)
                return -errno;

        /* Create a communication channel so that the child can tell the parent a proper error code in case it
         * failed. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return -errno;

        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, &pid);
        if (r < 0)
                return r;
        if (r == 0) {
                _cleanup_close_ int fd = -EBADF;
                const char *a;
                pid_t ppid;

                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
                 * here, after the parent opened its own user namespace. */

                ppid = getppid();
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Wait until the parent unshared the user namespace */
                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Disable the setgroups() system call in the child user namespace, for good. */
                a = procfs_file_alloca(ppid, "setgroups");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        if (errno != ENOENT) {
                                r = -errno;
                                goto child_fail;
                        }

                        /* If the file is missing the kernel is too old, let's continue anyway. */
                } else {
                        if (write(fd, "deny\n", 5) < 0) {
                                r = -errno;
                                goto child_fail;
                        }

                        fd = safe_close(fd);
                }

                /* First write the GID map */
                a = procfs_file_alloca(ppid, "gid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, gid_map, strlen(gid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }
                fd = safe_close(fd);

                /* Then write the UID map */
                a = procfs_file_alloca(ppid, "uid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, uid_map, strlen(uid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Success: exit without writing anything to the error pipe, so the parent reads EOF */
                _exit(EXIT_SUCCESS);

        child_fail:
                /* Report the negative errno to the parent; a failure to do so is deliberately ignored */
                (void) write(errno_pipe[1], &r, sizeof(r));
                _exit(EXIT_FAILURE);
        }

        /* Parent: close our copy of the write end, so that reading below returns EOF once the child's
         * copy is closed */
        errno_pipe[1] = safe_close(errno_pipe[1]);

        if (unshare(CLONE_NEWUSER) < 0)
                return -errno;

        /* Let the child know that the namespace is ready now */
        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
                return -errno;

        /* Try to read an error code from the child */
        n = read(errno_pipe[0], &r, sizeof(r));
        if (n < 0)
                return -errno;
        if (n == sizeof(r)) { /* an error code was sent to us */
                if (r < 0)
                        return r;
                return -EIO; /* a non-negative "error" makes no sense, treat as generic failure */
        }
        if (n != 0) /* on success we should have read 0 bytes */
                return -EIO;

        /* TAKE_PID() presumably disarms the sigkill_waitp cleanup, as we reap the child ourselves here */
        r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
        if (r < 0)
                return r;
        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
                return -EIO;

        return 0;
}
2232
2233 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2234 _cleanup_free_ char *src_abs = NULL;
2235 int r;
2236
2237 assert(source);
2238
2239 src_abs = path_join(root, source);
2240 if (!src_abs)
2241 return -ENOMEM;
2242
2243 STRV_FOREACH(dst, symlinks) {
2244 _cleanup_free_ char *dst_abs = NULL;
2245
2246 dst_abs = path_join(root, *dst);
2247 if (!dst_abs)
2248 return -ENOMEM;
2249
2250 r = mkdir_parents_label(dst_abs, 0755);
2251 if (r < 0)
2252 return r;
2253
2254 r = symlink_idempotent(src_abs, dst_abs, true);
2255 if (r < 0)
2256 return r;
2257 }
2258
2259 return 0;
2260 }
2261
/* Sets up all directories configured in 'context' for the directory type 'type' (i.e. one of the
 * runtime, state, cache, logs or configuration directory lists), below params->prefix[type].
 *
 * For each configured item this:
 *   - creates the parent directories,
 *   - in user scope, for state and logs directories, possibly creates a compatibility symlink to a
 *     pre-existing directory below the configuration prefix instead of a new directory,
 *   - either creates the directory below the root-owned "private/" subdirectory plus a symlink from the
 *     public location (when exec_directory_is_private() says so), or creates it directly in the public
 *     location, in both cases migrating a pre-existing directory from the "other" layout,
 *   - applies the configured access mode and, in system scope only, recursively adjusts ownership to
 *     'uid'/'gid'.
 *
 * If 'needs_mount_namespace' is false the additional symlinks configured for each item are created
 * here too; otherwise this function leaves them alone.
 *
 * Returns 0 on success, including when no prefix is set for 'type'. On failure returns a negative
 * errno-style error and stores the exit status matching 'type' in *exit_status; *exit_status is not
 * written on success. */
static int setup_exec_directory(
                const ExecContext *context,
                const ExecParameters *params,
                uid_t uid,
                gid_t gid,
                ExecDirectoryType type,
                bool needs_mount_namespace,
                int *exit_status) {

        /* Exit status to report to the caller when setting up a directory of the given type fails */
        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
                [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
                [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
                [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
                [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
        };
        int r;

        assert(context);
        assert(params);
        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
        assert(exit_status);

        /* No prefix known for this directory type? Then there's nothing to set up. */
        if (!params->prefix[type])
                return 0;

        /* If we are asked to chown the directories but have no valid UID/GID, fall back to root */
        if (params->flags & EXEC_CHOWN_DIRECTORIES) {
                if (!uid_is_valid(uid))
                        uid = 0;
                if (!gid_is_valid(gid))
                        gid = 0;
        }

        for (size_t i = 0; i < context->directories[type].n_items; i++) {
                /* p is the public location of the directory; pp remains NULL unless the directory is
                 * placed below "private/", in which case it is the private location. */
                _cleanup_free_ char *p = NULL, *pp = NULL;

                p = path_join(params->prefix[type], context->directories[type].items[i].path);
                if (!p) {
                        r = -ENOMEM;
                        goto fail;
                }

                r = mkdir_parents_label(p, 0755);
                if (r < 0)
                        goto fail;

                if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {

                        /* If we are in user mode, and a configuration directory exists but a state directory
                         * doesn't exist, then we likely are upgrading from an older systemd version that
                         * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
                         * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
                         * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
                         * separated. If a service has both dirs configured but only the configuration dir
                         * exists and the state dir does not, we assume we are looking at an update
                         * situation. Hence, create a compatibility symlink, so that all expectations are
                         * met.
                         *
                         * (We also do something similar with the log directory, which still doesn't exist in
                         * the xdg basedir spec. We'll make it a subdir of the state dir.) */

                        /* this assumes the state dir is always created before the configuration dir */
                        assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
                        assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);

                        r = laccess(p, F_OK);
                        if (r == -ENOENT) {
                                _cleanup_free_ char *q = NULL;

                                /* OK, we know that the state dir does not exist. Let's see if the dir exists
                                 * under the configuration hierarchy. */

                                if (type == EXEC_DIRECTORY_STATE)
                                        q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
                                else if (type == EXEC_DIRECTORY_LOGS)
                                        q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
                                else
                                        assert_not_reached();
                                if (!q) {
                                        r = -ENOMEM;
                                        goto fail;
                                }

                                r = laccess(q, F_OK);
                                if (r >= 0) {
                                        /* It does exist! This hence looks like an update. Symlink the
                                         * configuration directory into the state directory. */

                                        r = symlink_idempotent(q, p, /* make_relative= */ true);
                                        if (r < 0)
                                                goto fail;

                                        log_exec_notice(context, params, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
                                        /* The symlink is all we need for this item, skip mode/ownership handling */
                                        continue;
                                } else if (r != -ENOENT)
                                        log_exec_warning_errno(context, params, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);

                        } else if (r < 0)
                                log_exec_warning_errno(context, params, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
                }

                if (exec_directory_is_private(context, type)) {
                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
                         * case we want to avoid leaving a directory around fully accessible that is owned by
                         * a dynamic user whose UID is later on reused. To lock this down we use the same
                         * trick used by container managers to prohibit host users to get access to files of
                         * the same UID in containers: we place everything inside a directory that has an
                         * access mode of 0700 and is owned root:root, so that it acts as security boundary
                         * for unprivileged host code. We then use fs namespacing to make this directory
                         * permeable for the service itself.
                         *
                         * Specifically: for a service which wants a special directory "foo/" we first create
                         * a directory "private/" with access mode 0700 owned by root:root. Then we place
                         * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
                         * "private/foo". This way, privileged host users can access "foo/" as usual, but
                         * unprivileged host users can't look into it. Inside of the namespace of the unit
                         * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
                         * "private/foo/" is mounted under the same name, thus disabling the access boundary
                         * for the service and making sure it only gets access to the dirs it needs but no
                         * others. Tricky? Yes, absolutely, but it works!
                         *
                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
                         * to be owned by the service itself.
                         *
                         * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
                         * for sharing files or sockets with other services. */

                        pp = path_join(params->prefix[type], "private");
                        if (!pp) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
                        r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
                        if (r < 0)
                                goto fail;

                        /* From here on pp refers to the item's directory below the private root */
                        if (!path_extend(&pp, context->directories[type].items[i].path)) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* Create all directories between the configured directory and this private root, and mark them 0755 */
                        r = mkdir_parents_label(pp, 0755);
                        if (r < 0)
                                goto fail;

                        if (is_dir(p, false) > 0 &&
                            (laccess(pp, F_OK) == -ENOENT)) {

                                /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
                                 * it over. Most likely the service has been upgraded from one that didn't use
                                 * DynamicUser=1, to one that does. */

                                log_exec_info(context,
                                              params,
                                              "Found pre-existing public %s= directory %s, migrating to %s.\n"
                                              "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
                                              exec_directory_type_to_string(type), p, pp);

                                r = RET_NERRNO(rename(p, pp));
                                if (r < 0)
                                        goto fail;
                        } else {
                                /* Otherwise, create the actual directory for the service */

                                r = mkdir_label(pp, context->directories[type].mode);
                                if (r < 0 && r != -EEXIST)
                                        goto fail;
                        }

                        if (!context->directories[type].items[i].only_create) {
                                /* And link it up from the original place.
                                 * Notes
                                 * 1) If a mount namespace is going to be used, then this symlink remains on
                                 *    the host, and a new one for the child namespace will be created later.
                                 * 2) It is not necessary to create this symlink when one of its parent
                                 *    directories is specified and already created. E.g.
                                 *        StateDirectory=foo foo/bar
                                 *    In that case, the inode points to pp and p for "foo/bar" are the same:
                                 *        pp = "/var/lib/private/foo/bar"
                                 *        p = "/var/lib/foo/bar"
                                 *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
                                 *    we do not need to create the symlink, but we cannot create the symlink.
                                 *    See issue #24783. */
                                r = symlink_idempotent(pp, p, true);
                                if (r < 0)
                                        goto fail;
                        }

                } else {
                        _cleanup_free_ char *target = NULL;

                        if (type != EXEC_DIRECTORY_CONFIGURATION &&
                            readlink_and_make_absolute(p, &target) >= 0) {
                                _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;

                                /* This already exists and is a symlink? Interesting. Maybe it's one created
                                 * by DynamicUser=1 (see above)?
                                 *
                                 * We do this for all directory types except for ConfigurationDirectory=,
                                 * since they all support the private/ symlink logic at least in some
                                 * configurations, see above. */

                                r = chase(target, NULL, 0, &target_resolved, NULL);
                                if (r < 0)
                                        goto fail;

                                q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
                                if (!q) {
                                        r = -ENOMEM;
                                        goto fail;
                                }

                                /* /var/lib or friends may be symlinks. So, let's chase them also. */
                                r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
                                if (r < 0)
                                        goto fail;

                                if (path_equal(q_resolved, target_resolved)) {

                                        /* Hmm, apparently DynamicUser= was once turned on for this service,
                                         * but is no longer. Let's move the directory back up. */

                                        log_exec_info(context,
                                                      params,
                                                      "Found pre-existing private %s= directory %s, migrating to %s.\n"
                                                      "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
                                                      exec_directory_type_to_string(type), q, p);

                                        /* Remove the symlink first, then move the private directory into its place */
                                        r = RET_NERRNO(unlink(p));
                                        if (r < 0)
                                                goto fail;

                                        r = RET_NERRNO(rename(q, p));
                                        if (r < 0)
                                                goto fail;
                                }
                        }

                        r = mkdir_label(p, context->directories[type].mode);
                        if (r < 0) {
                                if (r != -EEXIST)
                                        goto fail;

                                if (type == EXEC_DIRECTORY_CONFIGURATION) {
                                        struct stat st;

                                        /* Don't change the owner/access mode of the configuration directory,
                                         * as in the common case it is not written to by a service, and shall
                                         * not be writable. */

                                        r = RET_NERRNO(stat(p, &st));
                                        if (r < 0)
                                                goto fail;

                                        /* Still complain if the access mode doesn't match */
                                        if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
                                                log_exec_warning(context,
                                                                 params,
                                                                 "%s \'%s\' already exists but the mode is different. "
                                                                 "(File system: %o %sMode: %o)",
                                                                 exec_directory_type_to_string(type), context->directories[type].items[i].path,
                                                                 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);

                                        continue;
                                }
                        }
                }

                /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
                 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
                 * current UID/GID ownership.) */
                r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
                if (r < 0)
                        goto fail;

                /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
                 * available to user code anyway */
                if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
                        continue;

                /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
                 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
                 * assignments to exist. */
                r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
                if (r < 0)
                        goto fail;
        }

        /* If we are not going to run in a namespace, set up the symlinks - otherwise
         * they are set up later, to allow configuring empty var/run/etc. */
        if (!needs_mount_namespace)
                for (size_t i = 0; i < context->directories[type].n_items; i++) {
                        r = create_many_symlinks(params->prefix[type],
                                                 context->directories[type].items[i].path,
                                                 context->directories[type].items[i].symlinks);
                        if (r < 0)
                                goto fail;
                }

        return 0;

fail:
        /* Tell the caller which exit status matches the directory type we failed on */
        *exit_status = exit_status_table[type];
        return r;
}
2570
2571 #if ENABLE_SMACK
2572 static int setup_smack(
2573 const ExecParameters *params,
2574 const ExecContext *context,
2575 int executable_fd) {
2576 int r;
2577
2578 assert(params);
2579 assert(executable_fd >= 0);
2580
2581 if (context->smack_process_label) {
2582 r = mac_smack_apply_pid(0, context->smack_process_label);
2583 if (r < 0)
2584 return r;
2585 } else if (params->fallback_smack_process_label) {
2586 _cleanup_free_ char *exec_label = NULL;
2587
2588 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
2589 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
2590 return r;
2591
2592 r = mac_smack_apply_pid(0, exec_label ?: params->fallback_smack_process_label);
2593 if (r < 0)
2594 return r;
2595 }
2596
2597 return 0;
2598 }
2599 #endif
2600
2601 static int compile_bind_mounts(
2602 const ExecContext *context,
2603 const ExecParameters *params,
2604 BindMount **ret_bind_mounts,
2605 size_t *ret_n_bind_mounts,
2606 char ***ret_empty_directories) {
2607
2608 _cleanup_strv_free_ char **empty_directories = NULL;
2609 BindMount *bind_mounts = NULL;
2610 size_t n, h = 0;
2611 int r;
2612
2613 assert(context);
2614 assert(params);
2615 assert(ret_bind_mounts);
2616 assert(ret_n_bind_mounts);
2617 assert(ret_empty_directories);
2618
2619 CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2620
2621 n = context->n_bind_mounts;
2622 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2623 if (!params->prefix[t])
2624 continue;
2625
2626 for (size_t i = 0; i < context->directories[t].n_items; i++)
2627 n += !context->directories[t].items[i].only_create;
2628 }
2629
2630 if (n <= 0) {
2631 *ret_bind_mounts = NULL;
2632 *ret_n_bind_mounts = 0;
2633 *ret_empty_directories = NULL;
2634 return 0;
2635 }
2636
2637 bind_mounts = new(BindMount, n);
2638 if (!bind_mounts)
2639 return -ENOMEM;
2640
2641 for (size_t i = 0; i < context->n_bind_mounts; i++) {
2642 BindMount *item = context->bind_mounts + i;
2643 _cleanup_free_ char *s = NULL, *d = NULL;
2644
2645 s = strdup(item->source);
2646 if (!s)
2647 return -ENOMEM;
2648
2649 d = strdup(item->destination);
2650 if (!d)
2651 return -ENOMEM;
2652
2653 bind_mounts[h++] = (BindMount) {
2654 .source = TAKE_PTR(s),
2655 .destination = TAKE_PTR(d),
2656 .read_only = item->read_only,
2657 .recursive = item->recursive,
2658 .ignore_enoent = item->ignore_enoent,
2659 };
2660 }
2661
2662 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2663 if (!params->prefix[t])
2664 continue;
2665
2666 if (context->directories[t].n_items == 0)
2667 continue;
2668
2669 if (exec_directory_is_private(context, t) &&
2670 !exec_context_with_rootfs(context)) {
2671 char *private_root;
2672
2673 /* So this is for a dynamic user, and we need to make sure the process can access its own
2674 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2675 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2676
2677 private_root = path_join(params->prefix[t], "private");
2678 if (!private_root)
2679 return -ENOMEM;
2680
2681 r = strv_consume(&empty_directories, private_root);
2682 if (r < 0)
2683 return r;
2684 }
2685
2686 for (size_t i = 0; i < context->directories[t].n_items; i++) {
2687 _cleanup_free_ char *s = NULL, *d = NULL;
2688
2689 /* When one of the parent directories is in the list, we cannot create the symlink
2690 * for the child directory. See also the comments in setup_exec_directory(). */
2691 if (context->directories[t].items[i].only_create)
2692 continue;
2693
2694 if (exec_directory_is_private(context, t))
2695 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
2696 else
2697 s = path_join(params->prefix[t], context->directories[t].items[i].path);
2698 if (!s)
2699 return -ENOMEM;
2700
2701 if (exec_directory_is_private(context, t) &&
2702 exec_context_with_rootfs(context))
2703 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2704 * directory is not created on the root directory. So, let's bind-mount the directory
2705 * on the 'non-private' place. */
2706 d = path_join(params->prefix[t], context->directories[t].items[i].path);
2707 else
2708 d = strdup(s);
2709 if (!d)
2710 return -ENOMEM;
2711
2712 bind_mounts[h++] = (BindMount) {
2713 .source = TAKE_PTR(s),
2714 .destination = TAKE_PTR(d),
2715 .read_only = false,
2716 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2717 .recursive = true,
2718 .ignore_enoent = false,
2719 };
2720 }
2721 }
2722
2723 assert(h == n);
2724
2725 *ret_bind_mounts = TAKE_PTR(bind_mounts);
2726 *ret_n_bind_mounts = n;
2727 *ret_empty_directories = TAKE_PTR(empty_directories);
2728
2729 return (int) n;
2730 }
2731
2732 /* ret_symlinks will contain a list of pairs src:dest that describes
2733 * the symlinks to create later on. For example, the symlinks needed
2734 * to safely give private directories to DynamicUser=1 users. */
2735 static int compile_symlinks(
2736 const ExecContext *context,
2737 const ExecParameters *params,
2738 bool setup_os_release_symlink,
2739 char ***ret_symlinks) {
2740
2741 _cleanup_strv_free_ char **symlinks = NULL;
2742 int r;
2743
2744 assert(context);
2745 assert(params);
2746 assert(ret_symlinks);
2747
2748 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2749 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
2750 _cleanup_free_ char *private_path = NULL, *path = NULL;
2751
2752 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
2753 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
2754
2755 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2756 dst_abs = path_join(params->prefix[dt], *symlink);
2757 if (!src_abs || !dst_abs)
2758 return -ENOMEM;
2759
2760 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2761 if (r < 0)
2762 return r;
2763 }
2764
2765 if (!exec_directory_is_private(context, dt) ||
2766 exec_context_with_rootfs(context) ||
2767 context->directories[dt].items[i].only_create)
2768 continue;
2769
2770 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
2771 if (!private_path)
2772 return -ENOMEM;
2773
2774 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2775 if (!path)
2776 return -ENOMEM;
2777
2778 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2779 if (r < 0)
2780 return r;
2781 }
2782 }
2783
2784 /* We make the host's os-release available via a symlink, so that we can copy it atomically
2785 * and readers will never get a half-written version. Note that, while the paths specified here are
2786 * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2787 * 'os-release -> .os-release-stage/os-release' is what will be created. */
2788 if (setup_os_release_symlink) {
2789 r = strv_extend_many(
2790 &symlinks,
2791 "/run/host/.os-release-stage/os-release",
2792 "/run/host/os-release");
2793 if (r < 0)
2794 return r;
2795 }
2796
2797 *ret_symlinks = TAKE_PTR(symlinks);
2798
2799 return 0;
2800 }
2801
2802 static bool insist_on_sandboxing(
2803 const ExecContext *context,
2804 const char *root_dir,
2805 const char *root_image,
2806 const BindMount *bind_mounts,
2807 size_t n_bind_mounts) {
2808
2809 assert(context);
2810 assert(n_bind_mounts == 0 || bind_mounts);
2811
2812 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2813 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2814 * rearrange stuff in a way we cannot ignore gracefully. */
2815
2816 if (context->n_temporary_filesystems > 0)
2817 return true;
2818
2819 if (root_dir || root_image)
2820 return true;
2821
2822 if (context->n_mount_images > 0)
2823 return true;
2824
2825 if (context->dynamic_user)
2826 return true;
2827
2828 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
2829 return true;
2830
2831 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2832 * essential. */
2833 for (size_t i = 0; i < n_bind_mounts; i++)
2834 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2835 return true;
2836
2837 if (context->log_namespace)
2838 return true;
2839
2840 return false;
2841 }
2842
2843 static int setup_ephemeral(
2844 const ExecContext *context,
2845 ExecRuntime *runtime,
2846 char **root_image, /* both input and output! modified if ephemeral logic enabled */
2847 char **root_directory) { /* ditto */
2848
2849 _cleanup_close_ int fd = -EBADF;
2850 _cleanup_free_ char *new_root = NULL;
2851 int r;
2852
2853 assert(context);
2854 assert(root_image);
2855 assert(root_directory);
2856
2857 if (!*root_image && !*root_directory)
2858 return 0;
2859
2860 if (!runtime || !runtime->ephemeral_copy)
2861 return 0;
2862
2863 assert(runtime->ephemeral_storage_socket[0] >= 0);
2864 assert(runtime->ephemeral_storage_socket[1] >= 0);
2865
2866 new_root = strdup(runtime->ephemeral_copy);
2867 if (!new_root)
2868 return log_oom_debug();
2869
2870 r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
2871 if (r < 0)
2872 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
2873
2874 CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
2875
2876 fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
2877 if (fd >= 0)
2878 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
2879 return 0;
2880 if (fd != -EAGAIN)
2881 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
2882
2883 if (*root_image) {
2884 log_debug("Making ephemeral copy of %s to %s", *root_image, new_root);
2885
2886 fd = copy_file(*root_image,
2887 new_root,
2888 O_EXCL,
2889 0600,
2890 COPY_LOCK_BSD|
2891 COPY_REFLINK|
2892 COPY_CRTIME);
2893 if (fd < 0)
2894 return log_debug_errno(fd, "Failed to copy image %s to %s: %m",
2895 *root_image, new_root);
2896
2897 /* A root image might be subject to lots of random writes so let's try to disable COW on it
2898 * which tends to not perform well in combination with lots of random writes.
2899 *
2900 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
2901 * copy, but we at least want to make the intention clear.
2902 */
2903 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
2904 if (r < 0)
2905 log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", new_root);
2906 } else {
2907 assert(*root_directory);
2908
2909 log_debug("Making ephemeral snapshot of %s to %s", *root_directory, new_root);
2910
2911 fd = btrfs_subvol_snapshot_at(
2912 AT_FDCWD, *root_directory,
2913 AT_FDCWD, new_root,
2914 BTRFS_SNAPSHOT_FALLBACK_COPY |
2915 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
2916 BTRFS_SNAPSHOT_RECURSIVE |
2917 BTRFS_SNAPSHOT_LOCK_BSD);
2918 if (fd < 0)
2919 return log_debug_errno(fd, "Failed to snapshot directory %s to %s: %m",
2920 *root_directory, new_root);
2921 }
2922
2923 r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
2924 if (r < 0)
2925 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
2926
2927 if (*root_image)
2928 free_and_replace(*root_image, new_root);
2929 else {
2930 assert(*root_directory);
2931 free_and_replace(*root_directory, new_root);
2932 }
2933
2934 return 1;
2935 }
2936
2937 static int verity_settings_prepare(
2938 VeritySettings *verity,
2939 const char *root_image,
2940 const void *root_hash,
2941 size_t root_hash_size,
2942 const char *root_hash_path,
2943 const void *root_hash_sig,
2944 size_t root_hash_sig_size,
2945 const char *root_hash_sig_path,
2946 const char *verity_data_path) {
2947
2948 int r;
2949
2950 assert(verity);
2951
2952 if (root_hash) {
2953 void *d;
2954
2955 d = memdup(root_hash, root_hash_size);
2956 if (!d)
2957 return -ENOMEM;
2958
2959 free_and_replace(verity->root_hash, d);
2960 verity->root_hash_size = root_hash_size;
2961 verity->designator = PARTITION_ROOT;
2962 }
2963
2964 if (root_hash_sig) {
2965 void *d;
2966
2967 d = memdup(root_hash_sig, root_hash_sig_size);
2968 if (!d)
2969 return -ENOMEM;
2970
2971 free_and_replace(verity->root_hash_sig, d);
2972 verity->root_hash_sig_size = root_hash_sig_size;
2973 verity->designator = PARTITION_ROOT;
2974 }
2975
2976 if (verity_data_path) {
2977 r = free_and_strdup(&verity->data_path, verity_data_path);
2978 if (r < 0)
2979 return r;
2980 }
2981
2982 r = verity_settings_load(
2983 verity,
2984 root_image,
2985 root_hash_path,
2986 root_hash_sig_path);
2987 if (r < 0)
2988 return log_debug_errno(r, "Failed to load root hash: %m");
2989
2990 return 0;
2991 }
2992
2993 static int pick_versions(
2994 const ExecContext *context,
2995 const ExecParameters *params,
2996 char **ret_root_image,
2997 char **ret_root_directory) {
2998
2999 int r;
3000
3001 assert(context);
3002 assert(params);
3003 assert(ret_root_image);
3004 assert(ret_root_directory);
3005
3006 if (context->root_image) {
3007 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
3008
3009 r = path_pick(/* toplevel_path= */ NULL,
3010 /* toplevel_fd= */ AT_FDCWD,
3011 context->root_image,
3012 &pick_filter_image_raw,
3013 PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3014 &result);
3015 if (r < 0)
3016 return r;
3017
3018 if (!result.path)
3019 return log_exec_debug_errno(context, params, SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_image);
3020
3021 *ret_root_image = TAKE_PTR(result.path);
3022 *ret_root_directory = NULL;
3023 return r;
3024 }
3025
3026 if (context->root_directory) {
3027 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
3028
3029 r = path_pick(/* toplevel_path= */ NULL,
3030 /* toplevel_fd= */ AT_FDCWD,
3031 context->root_directory,
3032 &pick_filter_image_dir,
3033 PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3034 &result);
3035 if (r < 0)
3036 return r;
3037
3038 if (!result.path)
3039 return log_exec_debug_errno(context, params, SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_directory);
3040
3041 *ret_root_image = NULL;
3042 *ret_root_directory = TAKE_PTR(result.path);
3043 return r;
3044 }
3045
3046 *ret_root_image = *ret_root_directory = NULL;
3047 return 0;
3048 }
3049
3050 static int apply_mount_namespace(
3051 ExecCommandFlags command_flags,
3052 const ExecContext *context,
3053 const ExecParameters *params,
3054 ExecRuntime *runtime,
3055 const char *memory_pressure_path,
3056 bool needs_sandboxing,
3057 char **error_path) {
3058
3059 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3060 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3061 **read_write_paths_cleanup = NULL;
3062 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3063 *extension_dir = NULL, *host_os_release_stage = NULL, *root_image = NULL, *root_dir = NULL;
3064 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3065 char **read_write_paths;
3066 bool setup_os_release_symlink;
3067 BindMount *bind_mounts = NULL;
3068 size_t n_bind_mounts = 0;
3069 int r;
3070
3071 assert(context);
3072
3073 CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3074
3075 if (params->flags & EXEC_APPLY_CHROOT) {
3076 r = pick_versions(
3077 context,
3078 params,
3079 &root_image,
3080 &root_dir);
3081 if (r < 0)
3082 return r;
3083
3084 r = setup_ephemeral(
3085 context,
3086 runtime,
3087 &root_image,
3088 &root_dir);
3089 if (r < 0)
3090 return r;
3091 }
3092
3093 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3094 if (r < 0)
3095 return r;
3096
3097 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3098 * service will need to write to it in order to start the notifications. */
3099 if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3100 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3101 if (!read_write_paths_cleanup)
3102 return -ENOMEM;
3103
3104 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3105 if (r < 0)
3106 return r;
3107
3108 read_write_paths = read_write_paths_cleanup;
3109 } else
3110 read_write_paths = context->read_write_paths;
3111
3112 if (needs_sandboxing) {
3113 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3114 * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3115 * use here. This does not apply when we are using /run/systemd/empty as fallback. */
3116
3117 if (context->private_tmp && runtime && runtime->shared) {
3118 if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3119 tmp_dir = runtime->shared->tmp_dir;
3120 else if (runtime->shared->tmp_dir)
3121 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
3122
3123 if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3124 var_tmp_dir = runtime->shared->var_tmp_dir;
3125 else if (runtime->shared->var_tmp_dir)
3126 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
3127 }
3128 }
3129
3130 /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3131 setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
3132 r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3133 if (r < 0)
3134 return r;
3135
3136 if (context->mount_propagation_flag == MS_SHARED)
3137 log_exec_debug(context,
3138 params,
3139 "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3140
3141 r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path);
3142 if (r < 0)
3143 return r;
3144
3145 if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
3146 propagate_dir = path_join("/run/systemd/propagate/", params->unit_id);
3147 if (!propagate_dir)
3148 return -ENOMEM;
3149
3150 incoming_dir = strdup("/run/systemd/incoming");
3151 if (!incoming_dir)
3152 return -ENOMEM;
3153
3154 extension_dir = strdup("/run/systemd/unit-extensions");
3155 if (!extension_dir)
3156 return -ENOMEM;
3157
3158 /* If running under a different root filesystem, propagate the host's os-release. We make a
3159 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3160 if (setup_os_release_symlink) {
3161 host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3162 if (!host_os_release_stage)
3163 return -ENOMEM;
3164 }
3165 } else {
3166 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3167
3168 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3169 return -ENOMEM;
3170
3171 if (setup_os_release_symlink) {
3172 if (asprintf(&host_os_release_stage,
3173 "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3174 geteuid()) < 0)
3175 return -ENOMEM;
3176 }
3177 }
3178
3179 if (root_image) {
3180 r = verity_settings_prepare(
3181 &verity,
3182 root_image,
3183 context->root_hash, context->root_hash_size, context->root_hash_path,
3184 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3185 context->root_verity);
3186 if (r < 0)
3187 return r;
3188 }
3189
3190 NamespaceParameters parameters = {
3191 .runtime_scope = params->runtime_scope,
3192
3193 .root_directory = root_dir,
3194 .root_image = root_image,
3195 .root_image_options = context->root_image_options,
3196 .root_image_policy = context->root_image_policy ?: &image_policy_service,
3197
3198 .read_write_paths = read_write_paths,
3199 .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
3200 .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
3201
3202 .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
3203 .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
3204
3205 .empty_directories = empty_directories,
3206 .symlinks = symlinks,
3207
3208 .bind_mounts = bind_mounts,
3209 .n_bind_mounts = n_bind_mounts,
3210
3211 .temporary_filesystems = context->temporary_filesystems,
3212 .n_temporary_filesystems = context->n_temporary_filesystems,
3213
3214 .mount_images = context->mount_images,
3215 .n_mount_images = context->n_mount_images,
3216 .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
3217
3218 .tmp_dir = tmp_dir,
3219 .var_tmp_dir = var_tmp_dir,
3220
3221 .creds_path = creds_path,
3222 .log_namespace = context->log_namespace,
3223 .mount_propagation_flag = context->mount_propagation_flag,
3224
3225 .verity = &verity,
3226
3227 .extension_images = context->extension_images,
3228 .n_extension_images = context->n_extension_images,
3229 .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
3230 .extension_directories = context->extension_directories,
3231
3232 .propagate_dir = propagate_dir,
3233 .incoming_dir = incoming_dir,
3234 .extension_dir = extension_dir,
3235 .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
3236 .host_os_release_stage = host_os_release_stage,
3237
3238 /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
3239 * otherwise enforce it, don't ignore protected paths and fail if we are enable to apply the
3240 * sandbox inside the mount namespace. */
3241 .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
3242
3243 .protect_control_groups = needs_sandboxing && context->protect_control_groups,
3244 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
3245 .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
3246 .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
3247 .protect_hostname = needs_sandboxing && context->protect_hostname,
3248
3249 .private_dev = needs_sandboxing && context->private_devices,
3250 .private_network = needs_sandboxing && exec_needs_network_namespace(context),
3251 .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
3252
3253 .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
3254
3255 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3256 .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
3257
3258 .protect_home = needs_sandboxing ? context->protect_home : false,
3259 .protect_system = needs_sandboxing ? context->protect_system : false,
3260 .protect_proc = needs_sandboxing ? context->protect_proc : false,
3261 .proc_subset = needs_sandboxing ? context->proc_subset : false,
3262 };
3263
3264 r = setup_namespace(&parameters, error_path);
3265 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3266 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3267 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3268 * completely different execution environment. */
3269 if (r == -ENOANO) {
3270 if (insist_on_sandboxing(
3271 context,
3272 root_dir, root_image,
3273 bind_mounts,
3274 n_bind_mounts))
3275 return log_exec_debug_errno(context,
3276 params,
3277 SYNTHETIC_ERRNO(EOPNOTSUPP),
3278 "Failed to set up namespace, and refusing to continue since "
3279 "the selected namespacing options alter mount environment non-trivially.\n"
3280 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3281 n_bind_mounts,
3282 context->n_temporary_filesystems,
3283 yes_no(root_dir),
3284 yes_no(root_image),
3285 yes_no(context->dynamic_user));
3286
3287 log_exec_debug(context, params, "Failed to set up namespace, assuming containerized execution and ignoring.");
3288 return 0;
3289 }
3290
3291 return r;
3292 }
3293
3294 static int apply_working_directory(
3295 const ExecContext *context,
3296 const ExecParameters *params,
3297 ExecRuntime *runtime,
3298 const char *home,
3299 int *exit_status) {
3300
3301 const char *wd;
3302 int r;
3303
3304 assert(context);
3305 assert(exit_status);
3306
3307 if (context->working_directory_home) {
3308 if (!home) {
3309 *exit_status = EXIT_CHDIR;
3310 return -ENXIO;
3311 }
3312
3313 wd = home;
3314 } else
3315 wd = empty_to_root(context->working_directory);
3316
3317 if (params->flags & EXEC_APPLY_CHROOT)
3318 r = RET_NERRNO(chdir(wd));
3319 else {
3320 _cleanup_close_ int dfd = -EBADF;
3321
3322 r = chase(wd,
3323 (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory,
3324 CHASE_PREFIX_ROOT|CHASE_AT_RESOLVE_IN_ROOT,
3325 /* ret_path= */ NULL,
3326 &dfd);
3327 if (r >= 0)
3328 r = RET_NERRNO(fchdir(dfd));
3329 }
3330
3331 if (r < 0 && !context->working_directory_missing_ok) {
3332 *exit_status = EXIT_CHDIR;
3333 return r;
3334 }
3335
3336 return 0;
3337 }
3338
3339 static int apply_root_directory(
3340 const ExecContext *context,
3341 const ExecParameters *params,
3342 ExecRuntime *runtime,
3343 const bool needs_mount_ns,
3344 int *exit_status) {
3345
3346 assert(context);
3347 assert(exit_status);
3348
3349 if (params->flags & EXEC_APPLY_CHROOT)
3350 if (!needs_mount_ns && context->root_directory)
3351 if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
3352 *exit_status = EXIT_CHROOT;
3353 return -errno;
3354 }
3355
3356 return 0;
3357 }
3358
3359 static int setup_keyring(
3360 const ExecContext *context,
3361 const ExecParameters *p,
3362 uid_t uid, gid_t gid) {
3363
3364 key_serial_t keyring;
3365 int r = 0;
3366 uid_t saved_uid;
3367 gid_t saved_gid;
3368
3369 assert(context);
3370 assert(p);
3371
3372 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3373 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3374 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3375 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3376 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3377 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3378
3379 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3380 return 0;
3381
3382 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3383 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3384 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3385 * & group is just as nasty as acquiring a reference to the user keyring. */
3386
3387 saved_uid = getuid();
3388 saved_gid = getgid();
3389
3390 if (gid_is_valid(gid) && gid != saved_gid) {
3391 if (setregid(gid, -1) < 0)
3392 return log_exec_error_errno(context,
3393 p,
3394 errno,
3395 "Failed to change GID for user keyring: %m");
3396 }
3397
3398 if (uid_is_valid(uid) && uid != saved_uid) {
3399 if (setreuid(uid, -1) < 0) {
3400 r = log_exec_error_errno(context,
3401 p,
3402 errno,
3403 "Failed to change UID for user keyring: %m");
3404 goto out;
3405 }
3406 }
3407
3408 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3409 if (keyring == -1) {
3410 if (errno == ENOSYS)
3411 log_exec_debug_errno(context,
3412 p,
3413 errno,
3414 "Kernel keyring not supported, ignoring.");
3415 else if (ERRNO_IS_PRIVILEGE(errno))
3416 log_exec_debug_errno(context,
3417 p,
3418 errno,
3419 "Kernel keyring access prohibited, ignoring.");
3420 else if (errno == EDQUOT)
3421 log_exec_debug_errno(context,
3422 p,
3423 errno,
3424 "Out of kernel keyrings to allocate, ignoring.");
3425 else
3426 r = log_exec_error_errno(context,
3427 p,
3428 errno,
3429 "Setting up kernel keyring failed: %m");
3430
3431 goto out;
3432 }
3433
3434 /* When requested link the user keyring into the session keyring. */
3435 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3436
3437 if (keyctl(KEYCTL_LINK,
3438 KEY_SPEC_USER_KEYRING,
3439 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3440 r = log_exec_error_errno(context,
3441 p,
3442 errno,
3443 "Failed to link user keyring into session keyring: %m");
3444 goto out;
3445 }
3446 }
3447
3448 /* Restore uid/gid back */
3449 if (uid_is_valid(uid) && uid != saved_uid) {
3450 if (setreuid(saved_uid, -1) < 0) {
3451 r = log_exec_error_errno(context,
3452 p,
3453 errno,
3454 "Failed to change UID back for user keyring: %m");
3455 goto out;
3456 }
3457 }
3458
3459 if (gid_is_valid(gid) && gid != saved_gid) {
3460 if (setregid(saved_gid, -1) < 0)
3461 return log_exec_error_errno(context,
3462 p,
3463 errno,
3464 "Failed to change GID back for user keyring: %m");
3465 }
3466
3467 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3468 if (!sd_id128_is_null(p->invocation_id)) {
3469 key_serial_t key;
3470
3471 key = add_key("user",
3472 "invocation_id",
3473 &p->invocation_id,
3474 sizeof(p->invocation_id),
3475 KEY_SPEC_SESSION_KEYRING);
3476 if (key == -1)
3477 log_exec_debug_errno(context,
3478 p,
3479 errno,
3480 "Failed to add invocation ID to keyring, ignoring: %m");
3481 else {
3482 if (keyctl(KEYCTL_SETPERM, key,
3483 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3484 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3485 r = log_exec_error_errno(context,
3486 p,
3487 errno,
3488 "Failed to restrict invocation ID permission: %m");
3489 }
3490 }
3491
3492 out:
3493 /* Revert back uid & gid for the last time, and exit */
3494 /* no extra logging, as only the first already reported error matters */
3495 if (getuid() != saved_uid)
3496 (void) setreuid(saved_uid, -1);
3497
3498 if (getgid() != saved_gid)
3499 (void) setregid(saved_gid, -1);
3500
3501 return r;
3502 }
3503
/* Appends each valid (i.e. non-negative) fd of the two-element 'pair' to 'array', in order, bumping the
 * element counter '*n' once per appended fd. The caller must make sure 'array' has room for up to two
 * more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t side = 0; side < 2; side++) {
                if (pair[side] < 0)
                        continue;

                array[*n] = pair[side];
                *n += 1;
        }
}
3514
3515 static int close_remaining_fds(
3516 const ExecParameters *params,
3517 const ExecRuntime *runtime,
3518 int socket_fd,
3519 const int *fds, size_t n_fds) {
3520
3521 size_t n_dont_close = 0;
3522 int dont_close[n_fds + 14];
3523
3524 assert(params);
3525
3526 if (params->stdin_fd >= 0)
3527 dont_close[n_dont_close++] = params->stdin_fd;
3528 if (params->stdout_fd >= 0)
3529 dont_close[n_dont_close++] = params->stdout_fd;
3530 if (params->stderr_fd >= 0)
3531 dont_close[n_dont_close++] = params->stderr_fd;
3532
3533 if (socket_fd >= 0)
3534 dont_close[n_dont_close++] = socket_fd;
3535 if (n_fds > 0) {
3536 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3537 n_dont_close += n_fds;
3538 }
3539
3540 if (runtime)
3541 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3542
3543 if (runtime && runtime->shared) {
3544 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3545 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
3546 }
3547
3548 if (runtime && runtime->dynamic_creds) {
3549 if (runtime->dynamic_creds->user)
3550 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3551 if (runtime->dynamic_creds->group)
3552 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
3553 }
3554
3555 if (params->user_lookup_fd >= 0)
3556 dont_close[n_dont_close++] = params->user_lookup_fd;
3557
3558 return close_all_fds(dont_close, n_dont_close);
3559 }
3560
/* Reports the resolved UID/GID to PID 1 once we know them. A single writev() carries the UID, the GID
 * and the unit name, in that order. Nothing is sent if there is no lookup fd, or if neither a valid UID
 * nor a valid GID was resolved.
 *
 * Returns 0 on success or if nothing was sent, -errno if writev() failed. */
static int send_user_lookup(
                const char *unit_id,
                int user_lookup_fd,
                uid_t uid,
                gid_t gid) {

        assert(unit_id);

        if (user_lookup_fd < 0 ||
            (!uid_is_valid(uid) && !gid_is_valid(gid)))
                return 0;

        struct iovec payload[] = {
                IOVEC_MAKE(&uid, sizeof(uid)),
                IOVEC_MAKE(&gid, sizeof(gid)),
                IOVEC_MAKE_STRING(unit_id),
        };

        if (writev(user_lookup_fd, payload, 3) >= 0)
                return 0;

        return -errno;
}
3588
3589 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3590 int r;
3591
3592 assert(c);
3593 assert(home);
3594 assert(buf);
3595
3596 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3597
3598 if (*home)
3599 return 0;
3600
3601 if (!c->working_directory_home)
3602 return 0;
3603
3604 r = get_home_dir(buf);
3605 if (r < 0)
3606 return r;
3607
3608 *home = *buf;
3609 return 1;
3610 }
3611
3612 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3613 _cleanup_strv_free_ char ** list = NULL;
3614 int r;
3615
3616 assert(c);
3617 assert(p);
3618 assert(ret);
3619
3620 assert(c->dynamic_user);
3621
3622 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3623 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3624 * directories. */
3625
3626 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3627 if (t == EXEC_DIRECTORY_CONFIGURATION)
3628 continue;
3629
3630 if (!p->prefix[t])
3631 continue;
3632
3633 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3634 char *e;
3635
3636 if (exec_directory_is_private(c, t))
3637 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3638 else
3639 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3640 if (!e)
3641 return -ENOMEM;
3642
3643 r = strv_consume(&list, e);
3644 if (r < 0)
3645 return r;
3646 }
3647 }
3648
3649 *ret = TAKE_PTR(list);
3650
3651 return 0;
3652 }
3653
3654 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3655 _cleanup_(cpu_set_reset) CPUSet s = {};
3656 int r;
3657
3658 assert(c);
3659 assert(ret);
3660
3661 if (!c->numa_policy.nodes.set) {
3662 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3663 return 0;
3664 }
3665
3666 r = numa_to_cpu_set(&c->numa_policy, &s);
3667 if (r < 0)
3668 return r;
3669
3670 cpu_set_reset(ret);
3671
3672 return cpu_set_add_all(ret, &s);
3673 }
3674
3675 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int *fd) {
3676 int r;
3677
3678 assert(fds);
3679 assert(n_fds);
3680 assert(*n_fds < fds_size);
3681 assert(fd);
3682
3683 if (*fd < 0)
3684 return 0;
3685
3686 if (*fd < 3 + (int) *n_fds) {
3687 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3688 * the fds we pass to the process (or which are closed only during execve). */
3689
3690 r = fcntl(*fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3691 if (r < 0)
3692 return -errno;
3693
3694 close_and_replace(*fd, r);
3695 }
3696
3697 fds[(*n_fds)++] = *fd;
3698 return 1;
3699 }
3700
3701 static int connect_unix_harder(const ExecContext *c, const ExecParameters *p, const OpenFile *of, int ofd) {
3702 union sockaddr_union addr = {
3703 .un.sun_family = AF_UNIX,
3704 };
3705 socklen_t sa_len;
3706 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3707 int r;
3708
3709 assert(c);
3710 assert(p);
3711 assert(of);
3712 assert(ofd >= 0);
3713
3714 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3715 if (r < 0)
3716 return log_exec_error_errno(c, p, r, "Failed to set sockaddr for %s: %m", of->path);
3717
3718 sa_len = r;
3719
3720 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
3721 _cleanup_close_ int fd = -EBADF;
3722
3723 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
3724 if (fd < 0)
3725 return log_exec_error_errno(c,
3726 p,
3727 errno,
3728 "Failed to create socket for %s: %m",
3729 of->path);
3730
3731 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3732 if (r == -EPROTOTYPE)
3733 continue;
3734 if (r < 0)
3735 return log_exec_error_errno(c,
3736 p,
3737 r,
3738 "Failed to connect socket for %s: %m",
3739 of->path);
3740
3741 return TAKE_FD(fd);
3742 }
3743
3744 return log_exec_error_errno(c,
3745 p,
3746 SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".",
3747 of->path);
3748 }
3749
3750 static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const OpenFile *of) {
3751 struct stat st;
3752 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3753
3754 assert(c);
3755 assert(p);
3756 assert(of);
3757
3758 ofd = open(of->path, O_PATH | O_CLOEXEC);
3759 if (ofd < 0)
3760 return log_exec_error_errno(c, p, errno, "Could not open \"%s\": %m", of->path);
3761
3762 if (fstat(ofd, &st) < 0)
3763 return log_exec_error_errno(c, p, errno, "Failed to stat %s: %m", of->path);
3764
3765 if (S_ISSOCK(st.st_mode)) {
3766 fd = connect_unix_harder(c, p, of, ofd);
3767 if (fd < 0)
3768 return fd;
3769
3770 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
3771 return log_exec_error_errno(c, p, errno, "Failed to shutdown send for socket %s: %m",
3772 of->path);
3773
3774 log_exec_debug(c, p, "socket %s opened (fd=%d)", of->path, fd);
3775 } else {
3776 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3777 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3778 flags |= O_APPEND;
3779 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3780 flags |= O_TRUNC;
3781
3782 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3783 if (fd < 0)
3784 return log_exec_error_errno(c, p, fd, "Failed to open file %s: %m", of->path);
3785
3786 log_exec_debug(c, p, "file %s opened (fd=%d)", of->path, fd);
3787 }
3788
3789 return TAKE_FD(fd);
3790 }
3791
3792 static int collect_open_file_fds(const ExecContext *c, ExecParameters *p, size_t *n_fds) {
3793 int r;
3794
3795 assert(c);
3796 assert(p);
3797 assert(n_fds);
3798
3799 LIST_FOREACH(open_files, of, p->open_files) {
3800 _cleanup_close_ int fd = -EBADF;
3801
3802 fd = get_open_file_fd(c, p, of);
3803 if (fd < 0) {
3804 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3805 log_exec_debug_errno(c, p, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
3806 continue;
3807 }
3808
3809 return fd;
3810 }
3811
3812 if (!GREEDY_REALLOC(p->fds, *n_fds + 1))
3813 return -ENOMEM;
3814
3815 r = strv_extend(&p->fd_names, of->fdname);
3816 if (r < 0)
3817 return r;
3818
3819 p->fds[*n_fds] = TAKE_FD(fd);
3820
3821 (*n_fds)++;
3822 }
3823
3824 return 0;
3825 }
3826
3827 static void log_command_line(
3828 const ExecContext *context,
3829 const ExecParameters *params,
3830 const char *msg,
3831 const char *executable,
3832 char **argv) {
3833
3834 assert(context);
3835 assert(params);
3836 assert(msg);
3837 assert(executable);
3838
3839 if (!DEBUG_LOGGING)
3840 return;
3841
3842 _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3843
3844 log_exec_struct(context, params, LOG_DEBUG,
3845 "EXECUTABLE=%s", executable,
3846 LOG_EXEC_MESSAGE(params, "%s: %s", msg, strnull(cmdline)),
3847 LOG_EXEC_INVOCATION_ID(params));
3848 }
3849
3850 static bool exec_context_need_unprivileged_private_users(
3851 const ExecContext *context,
3852 const ExecParameters *params) {
3853
3854 assert(context);
3855 assert(params);
3856
3857 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3858 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3859 * (system manager) then we have privileges and don't need this. */
3860 if (params->runtime_scope != RUNTIME_SCOPE_USER)
3861 return false;
3862
3863 return context->private_users ||
3864 context->private_tmp ||
3865 context->private_devices ||
3866 context->private_network ||
3867 context->network_namespace_path ||
3868 context->private_ipc ||
3869 context->ipc_namespace_path ||
3870 context->private_mounts > 0 ||
3871 context->mount_apivfs ||
3872 context->n_bind_mounts > 0 ||
3873 context->n_temporary_filesystems > 0 ||
3874 context->root_directory ||
3875 !strv_isempty(context->extension_directories) ||
3876 context->protect_system != PROTECT_SYSTEM_NO ||
3877 context->protect_home != PROTECT_HOME_NO ||
3878 context->protect_kernel_tunables ||
3879 context->protect_kernel_modules ||
3880 context->protect_kernel_logs ||
3881 context->protect_control_groups ||
3882 context->protect_clock ||
3883 context->protect_hostname ||
3884 !strv_isempty(context->read_write_paths) ||
3885 !strv_isempty(context->read_only_paths) ||
3886 !strv_isempty(context->inaccessible_paths) ||
3887 !strv_isempty(context->exec_paths) ||
3888 !strv_isempty(context->no_exec_paths);
3889 }
3890
3891 static bool exec_context_shall_confirm_spawn(const ExecContext *context) {
3892 assert(context);
3893
3894 if (confirm_spawn_disabled())
3895 return false;
3896
3897 /* For some reasons units remaining in the same process group
3898 * as PID 1 fail to acquire the console even if it's not used
3899 * by any process. So skip the confirmation question for them. */
3900 return !context->same_pgrp;
3901 }
3902
3903 static int exec_context_named_iofds(
3904 const ExecContext *c,
3905 const ExecParameters *p,
3906 int named_iofds[static 3]) {
3907
3908 size_t targets;
3909 const char* stdio_fdname[3];
3910 size_t n_fds;
3911
3912 assert(c);
3913 assert(p);
3914 assert(named_iofds);
3915
3916 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3917 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3918 (c->std_error == EXEC_OUTPUT_NAMED_FD);
3919
3920 for (size_t i = 0; i < 3; i++)
3921 stdio_fdname[i] = exec_context_fdname(c, i);
3922
3923 n_fds = p->n_storage_fds + p->n_socket_fds;
3924
3925 for (size_t i = 0; i < n_fds && targets > 0; i++)
3926 if (named_iofds[STDIN_FILENO] < 0 &&
3927 c->std_input == EXEC_INPUT_NAMED_FD &&
3928 stdio_fdname[STDIN_FILENO] &&
3929 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3930
3931 named_iofds[STDIN_FILENO] = p->fds[i];
3932 targets--;
3933
3934 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3935 c->std_output == EXEC_OUTPUT_NAMED_FD &&
3936 stdio_fdname[STDOUT_FILENO] &&
3937 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3938
3939 named_iofds[STDOUT_FILENO] = p->fds[i];
3940 targets--;
3941
3942 } else if (named_iofds[STDERR_FILENO] < 0 &&
3943 c->std_error == EXEC_OUTPUT_NAMED_FD &&
3944 stdio_fdname[STDERR_FILENO] &&
3945 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3946
3947 named_iofds[STDERR_FILENO] = p->fds[i];
3948 targets--;
3949 }
3950
3951 return targets == 0 ? 0 : -ENOENT;
3952 }
3953
3954 static void exec_shared_runtime_close(ExecSharedRuntime *shared) {
3955 if (!shared)
3956 return;
3957
3958 safe_close_pair(shared->netns_storage_socket);
3959 safe_close_pair(shared->ipcns_storage_socket);
3960 }
3961
3962 static void exec_runtime_close(ExecRuntime *rt) {
3963 if (!rt)
3964 return;
3965
3966 safe_close_pair(rt->ephemeral_storage_socket);
3967
3968 exec_shared_runtime_close(rt->shared);
3969 dynamic_creds_close(rt->dynamic_creds);
3970 }
3971
3972 static void exec_params_close(ExecParameters *p) {
3973 if (!p)
3974 return;
3975
3976 p->stdin_fd = safe_close(p->stdin_fd);
3977 p->stdout_fd = safe_close(p->stdout_fd);
3978 p->stderr_fd = safe_close(p->stderr_fd);
3979 }
3980
3981 int exec_invoke(
3982 const ExecCommand *command,
3983 const ExecContext *context,
3984 ExecParameters *params,
3985 ExecRuntime *runtime,
3986 const CGroupContext *cgroup_context,
3987 int *exit_status) {
3988
3989 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
3990 int r, ngids = 0;
3991 _cleanup_free_ gid_t *supplementary_gids = NULL;
3992 const char *username = NULL, *groupname = NULL;
3993 _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
3994 const char *home = NULL, *shell = NULL;
3995 char **final_argv = NULL;
3996 dev_t journal_stream_dev = 0;
3997 ino_t journal_stream_ino = 0;
3998 bool userns_set_up = false;
3999 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4000 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
4001 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
4002 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
4003 bool keep_seccomp_privileges = false;
4004 #if HAVE_SELINUX
4005 _cleanup_free_ char *mac_selinux_context_net = NULL;
4006 bool use_selinux = false;
4007 #endif
4008 #if ENABLE_SMACK
4009 bool use_smack = false;
4010 #endif
4011 #if HAVE_APPARMOR
4012 bool use_apparmor = false;
4013 #endif
4014 #if HAVE_SECCOMP
4015 uint64_t saved_bset = 0;
4016 #endif
4017 uid_t saved_uid = getuid();
4018 gid_t saved_gid = getgid();
4019 uid_t uid = UID_INVALID;
4020 gid_t gid = GID_INVALID;
4021 size_t n_fds, /* fds to pass to the child */
4022 n_keep_fds; /* total number of fds not to close */
4023 int secure_bits;
4024 _cleanup_free_ gid_t *gids_after_pam = NULL;
4025 int ngids_after_pam = 0;
4026
4027 int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
4028 size_t n_storage_fds, n_socket_fds;
4029
4030 assert(command);
4031 assert(context);
4032 assert(params);
4033 assert(exit_status);
4034
4035 /* This should be mostly redundant, as the log level is also passed as an argument of the executor,
4036 * and is already applied earlier. Just for safety. */
4037 if (context->log_level_max >= 0)
4038 log_set_max_level(context->log_level_max);
4039
4040 /* Explicitly test for CVE-2021-4034 inspired invocations */
4041 if (!command->path || strv_isempty(command->argv)) {
4042 *exit_status = EXIT_EXEC;
4043 return log_exec_error_errno(
4044 context,
4045 params,
4046 SYNTHETIC_ERRNO(EINVAL),
4047 "Invalid command line arguments.");
4048 }
4049
4050 LOG_CONTEXT_PUSH_EXEC(context, params);
4051
4052 if (context->std_input == EXEC_INPUT_SOCKET ||
4053 context->std_output == EXEC_OUTPUT_SOCKET ||
4054 context->std_error == EXEC_OUTPUT_SOCKET) {
4055
4056 if (params->n_socket_fds > 1)
4057 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
4058
4059 if (params->n_socket_fds == 0)
4060 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
4061
4062 socket_fd = params->fds[0];
4063 n_storage_fds = n_socket_fds = 0;
4064 } else {
4065 n_socket_fds = params->n_socket_fds;
4066 n_storage_fds = params->n_storage_fds;
4067 }
4068 n_fds = n_socket_fds + n_storage_fds;
4069
4070 r = exec_context_named_iofds(context, params, named_iofds);
4071 if (r < 0)
4072 return log_exec_error_errno(context, params, r, "Failed to load a named file descriptor: %m");
4073
4074 rename_process_from_path(command->path);
4075
4076 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4077 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4078 * both of which will be demoted to SIG_DFL. */
4079 (void) default_signals(SIGNALS_CRASH_HANDLER,
4080 SIGNALS_IGNORE);
4081
4082 if (context->ignore_sigpipe)
4083 (void) ignore_signals(SIGPIPE);
4084
4085 r = reset_signal_mask();
4086 if (r < 0) {
4087 *exit_status = EXIT_SIGNAL_MASK;
4088 return log_exec_error_errno(context, params, r, "Failed to set process signal mask: %m");
4089 }
4090
4091 if (params->idle_pipe)
4092 do_idle_pipe_dance(params->idle_pipe);
4093
4094 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4095 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4096 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4097 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4098
4099 log_forget_fds();
4100 log_set_open_when_needed(true);
4101 log_settle_target();
4102
4103 /* In case anything used libc syslog(), close this here, too */
4104 closelog();
4105
4106 r = collect_open_file_fds(context, params, &n_fds);
4107 if (r < 0) {
4108 *exit_status = EXIT_FDS;
4109 return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m");
4110 }
4111
4112 int keep_fds[n_fds + 3];
4113 memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int));
4114 n_keep_fds = n_fds;
4115
4116 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->exec_fd);
4117 if (r < 0) {
4118 *exit_status = EXIT_FDS;
4119 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4120 }
4121
4122 #if HAVE_LIBBPF
4123 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->bpf_restrict_fs_map_fd);
4124 if (r < 0) {
4125 *exit_status = EXIT_FDS;
4126 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4127 }
4128 #endif
4129
4130 r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds);
4131 if (r < 0) {
4132 *exit_status = EXIT_FDS;
4133 return log_exec_error_errno(context, params, r, "Failed to close unwanted file descriptors: %m");
4134 }
4135
4136 if (!context->same_pgrp &&
4137 setsid() < 0) {
4138 *exit_status = EXIT_SETSID;
4139 return log_exec_error_errno(context, params, errno, "Failed to create new process session: %m");
4140 }
4141
4142 exec_context_tty_reset(context, params);
4143
4144 if (params->shall_confirm_spawn && exec_context_shall_confirm_spawn(context)) {
4145 _cleanup_free_ char *cmdline = NULL;
4146
4147 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4148 if (!cmdline) {
4149 *exit_status = EXIT_MEMORY;
4150 return log_oom();
4151 }
4152
4153 r = ask_for_confirmation(context, params, cmdline);
4154 if (r != CONFIRM_EXECUTE) {
4155 if (r == CONFIRM_PRETEND_SUCCESS) {
4156 *exit_status = EXIT_SUCCESS;
4157 return 0;
4158 }
4159
4160 *exit_status = EXIT_CONFIRM;
4161 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ECANCELED),
4162 "Execution cancelled by the user");
4163 }
4164 }
4165
4166 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4167 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4168 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4169 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4170 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4171 if (setenv("SYSTEMD_ACTIVATION_UNIT", params->unit_id, true) != 0 ||
4172 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4173 *exit_status = EXIT_MEMORY;
4174 return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4175 }
4176
4177 if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4178 _cleanup_strv_free_ char **suggested_paths = NULL;
4179
4180 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4181 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4182 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4183 *exit_status = EXIT_USER;
4184 return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4185 }
4186
4187 r = compile_suggested_paths(context, params, &suggested_paths);
4188 if (r < 0) {
4189 *exit_status = EXIT_MEMORY;
4190 return log_oom();
4191 }
4192
4193 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4194 if (r < 0) {
4195 *exit_status = EXIT_USER;
4196 if (r == -EILSEQ)
4197 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4198 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4199 return log_exec_error_errno(context, params, r, "Failed to update dynamic user credentials: %m");
4200 }
4201
4202 if (!uid_is_valid(uid)) {
4203 *exit_status = EXIT_USER;
4204 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4205 }
4206
4207 if (!gid_is_valid(gid)) {
4208 *exit_status = EXIT_USER;
4209 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4210 }
4211
4212 if (runtime->dynamic_creds->user)
4213 username = runtime->dynamic_creds->user->name;
4214
4215 } else {
4216 if (context->user) {
4217 r = get_fixed_user(context->user, &username, &uid, &gid, &home, &shell);
4218 if (r < 0) {
4219 *exit_status = EXIT_USER;
4220 return log_exec_error_errno(context, params, r, "Failed to determine user credentials: %m");
4221 }
4222 }
4223
4224 if (context->group) {
4225 r = get_fixed_group(context->group, &groupname, &gid);
4226 if (r < 0) {
4227 *exit_status = EXIT_GROUP;
4228 return log_exec_error_errno(context, params, r, "Failed to determine group credentials: %m");
4229 }
4230 }
4231 }
4232
4233 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4234 r = get_supplementary_groups(context, username, groupname, gid,
4235 &supplementary_gids, &ngids);
4236 if (r < 0) {
4237 *exit_status = EXIT_GROUP;
4238 return log_exec_error_errno(context, params, r, "Failed to determine supplementary groups: %m");
4239 }
4240
4241 r = send_user_lookup(params->unit_id, params->user_lookup_fd, uid, gid);
4242 if (r < 0) {
4243 *exit_status = EXIT_USER;
4244 return log_exec_error_errno(context, params, r, "Failed to send user credentials to PID1: %m");
4245 }
4246
4247 params->user_lookup_fd = safe_close(params->user_lookup_fd);
4248
4249 r = acquire_home(context, uid, &home, &home_buffer);
4250 if (r < 0) {
4251 *exit_status = EXIT_CHDIR;
4252 return log_exec_error_errno(context, params, r, "Failed to determine $HOME for user: %m");
4253 }
4254
4255 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4256 if (socket_fd >= 0)
4257 (void) fd_nonblock(socket_fd, false);
4258
4259 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4260 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4261 if (params->cgroup_path) {
4262 _cleanup_free_ char *p = NULL;
4263
4264 r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4265 if (r < 0) {
4266 *exit_status = EXIT_CGROUP;
4267 return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4268 }
4269
4270 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4271 if (r == -EUCLEAN) {
4272 *exit_status = EXIT_CGROUP;
4273 return log_exec_error_errno(context, params, r, "Failed to attach process to cgroup %s "
4274 "because the cgroup or one of its parents or "
4275 "siblings is in the threaded mode: %m", p);
4276 }
4277 if (r < 0) {
4278 *exit_status = EXIT_CGROUP;
4279 return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", p);
4280 }
4281 }
4282
4283 if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4284 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4285 if (r < 0) {
4286 *exit_status = EXIT_NETWORK;
4287 return log_exec_error_errno(context, params, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4288 }
4289 }
4290
4291 if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4292 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4293 if (r < 0) {
4294 *exit_status = EXIT_NAMESPACE;
4295 return log_exec_error_errno(context, params, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4296 }
4297 }
4298
4299 r = setup_input(context, params, socket_fd, named_iofds);
4300 if (r < 0) {
4301 *exit_status = EXIT_STDIN;
4302 return log_exec_error_errno(context, params, r, "Failed to set up standard input: %m");
4303 }
4304
4305 r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4306 if (r < 0) {
4307 *exit_status = EXIT_STDOUT;
4308 return log_exec_error_errno(context, params, r, "Failed to set up standard output: %m");
4309 }
4310
4311 r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4312 if (r < 0) {
4313 *exit_status = EXIT_STDERR;
4314 return log_exec_error_errno(context, params, r, "Failed to set up standard error output: %m");
4315 }
4316
4317 if (context->oom_score_adjust_set) {
4318 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4319 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4320 r = set_oom_score_adjust(context->oom_score_adjust);
4321 if (ERRNO_IS_NEG_PRIVILEGE(r))
4322 log_exec_debug_errno(context, params, r,
4323 "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4324 else if (r < 0) {
4325 *exit_status = EXIT_OOM_ADJUST;
4326 return log_exec_error_errno(context, params, r, "Failed to adjust OOM setting: %m");
4327 }
4328 }
4329
4330 if (context->coredump_filter_set) {
4331 r = set_coredump_filter(context->coredump_filter);
4332 if (ERRNO_IS_NEG_PRIVILEGE(r))
4333 log_exec_debug_errno(context, params, r, "Failed to adjust coredump_filter, ignoring: %m");
4334 else if (r < 0) {
4335 *exit_status = EXIT_LIMITS;
4336 return log_exec_error_errno(context, params, r, "Failed to adjust coredump_filter: %m");
4337 }
4338 }
4339
4340 if (context->nice_set) {
4341 r = setpriority_closest(context->nice);
4342 if (r < 0) {
4343 *exit_status = EXIT_NICE;
4344 return log_exec_error_errno(context, params, r, "Failed to set up process scheduling priority (nice level): %m");
4345 }
4346 }
4347
4348 if (context->cpu_sched_set) {
4349 struct sched_param param = {
4350 .sched_priority = context->cpu_sched_priority,
4351 };
4352
4353 r = sched_setscheduler(0,
4354 context->cpu_sched_policy |
4355 (context->cpu_sched_reset_on_fork ?
4356 SCHED_RESET_ON_FORK : 0),
4357 &param);
4358 if (r < 0) {
4359 *exit_status = EXIT_SETSCHEDULER;
4360 return log_exec_error_errno(context, params, errno, "Failed to set up CPU scheduling: %m");
4361 }
4362 }
4363
4364 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4365 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4366 const CPUSet *cpu_set;
4367
4368 if (context->cpu_affinity_from_numa) {
4369 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4370 if (r < 0) {
4371 *exit_status = EXIT_CPUAFFINITY;
4372 return log_exec_error_errno(context, params, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4373 }
4374
4375 cpu_set = &converted_cpu_set;
4376 } else
4377 cpu_set = &context->cpu_set;
4378
4379 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4380 *exit_status = EXIT_CPUAFFINITY;
4381 return log_exec_error_errno(context, params, errno, "Failed to set up CPU affinity: %m");
4382 }
4383 }
4384
4385 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4386 r = apply_numa_policy(&context->numa_policy);
4387 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4388 log_exec_debug_errno(context, params, r, "NUMA support not available, ignoring.");
4389 else if (r < 0) {
4390 *exit_status = EXIT_NUMA_POLICY;
4391 return log_exec_error_errno(context, params, r, "Failed to set NUMA memory policy: %m");
4392 }
4393 }
4394
4395 if (context->ioprio_set)
4396 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4397 *exit_status = EXIT_IOPRIO;
4398 return log_exec_error_errno(context, params, errno, "Failed to set up IO scheduling priority: %m");
4399 }
4400
4401 if (context->timer_slack_nsec != NSEC_INFINITY)
4402 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4403 *exit_status = EXIT_TIMERSLACK;
4404 return log_exec_error_errno(context, params, errno, "Failed to set up timer slack: %m");
4405 }
4406
4407 if (context->personality != PERSONALITY_INVALID) {
4408 r = safe_personality(context->personality);
4409 if (r < 0) {
4410 *exit_status = EXIT_PERSONALITY;
4411 return log_exec_error_errno(context, params, r, "Failed to set up execution domain (personality): %m");
4412 }
4413 }
4414
4415 #if ENABLE_UTMP
4416 if (context->utmp_id) {
4417 _cleanup_free_ char *username_alloc = NULL;
4418
4419 if (!username && context->utmp_mode == EXEC_UTMP_USER) {
4420 username_alloc = uid_to_name(uid_is_valid(uid) ? uid : saved_uid);
4421 if (!username_alloc) {
4422 *exit_status = EXIT_USER;
4423 return log_oom();
4424 }
4425 }
4426
4427 const char *line = context->tty_path ?
4428 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4429 NULL;
4430 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4431 line,
4432 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4433 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4434 USER_PROCESS,
4435 username ?: username_alloc);
4436 }
4437 #endif
4438
4439 if (uid_is_valid(uid)) {
4440 r = chown_terminal(STDIN_FILENO, uid);
4441 if (r < 0) {
4442 *exit_status = EXIT_STDIN;
4443 return log_exec_error_errno(context, params, r, "Failed to change ownership of terminal: %m");
4444 }
4445 }
4446
4447 if (params->cgroup_path) {
4448 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4449 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4450 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4451 * touch a single hierarchy too. */
4452
4453 if (params->flags & EXEC_CGROUP_DELEGATE) {
4454 _cleanup_free_ char *p = NULL;
4455
4456 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4457 if (r < 0) {
4458 *exit_status = EXIT_CGROUP;
4459 return log_exec_error_errno(context, params, r, "Failed to adjust control group access: %m");
4460 }
4461
4462 r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4463 if (r < 0) {
4464 *exit_status = EXIT_CGROUP;
4465 return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4466 }
4467 if (r > 0) {
4468 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
4469 if (r < 0) {
4470 *exit_status = EXIT_CGROUP;
4471 return log_exec_error_errno(context, params, r, "Failed to adjust control subgroup access: %m");
4472 }
4473 }
4474 }
4475
4476 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4477 if (cgroup_context_want_memory_pressure(cgroup_context)) {
4478 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4479 if (r < 0) {
4480 *exit_status = EXIT_MEMORY;
4481 return log_oom();
4482 }
4483
4484 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4485 if (r < 0) {
4486 log_exec_full_errno(context, params, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4487 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4488 memory_pressure_path = mfree(memory_pressure_path);
4489 }
4490 } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4491 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning off memory pressure watch */
4492 if (!memory_pressure_path) {
4493 *exit_status = EXIT_MEMORY;
4494 return log_oom();
4495 }
4496 }
4497 }
4498 }
4499
4500 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4501
4502 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4503 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4504 if (r < 0)
4505 return log_exec_error_errno(context, params, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4506 }
4507
4508 r = exec_setup_credentials(context, params, params->unit_id, uid, gid);
4509 if (r < 0) {
4510 *exit_status = EXIT_CREDENTIALS;
4511 return log_exec_error_errno(context, params, r, "Failed to set up credentials: %m");
4512 }
4513
4514 r = build_environment(
4515 context,
4516 params,
4517 cgroup_context,
4518 n_fds,
4519 home,
4520 username,
4521 shell,
4522 journal_stream_dev,
4523 journal_stream_ino,
4524 memory_pressure_path,
4525 &our_env);
4526 if (r < 0) {
4527 *exit_status = EXIT_MEMORY;
4528 return log_oom();
4529 }
4530
4531 r = build_pass_environment(context, &pass_env);
4532 if (r < 0) {
4533 *exit_status = EXIT_MEMORY;
4534 return log_oom();
4535 }
4536
4537 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4538 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4539 * not specify PATH but the unit has ExecSearchPath. */
4540 if (!strv_isempty(context->exec_search_path)) {
4541 _cleanup_free_ char *joined = NULL;
4542
4543 joined = strv_join(context->exec_search_path, ":");
4544 if (!joined) {
4545 *exit_status = EXIT_MEMORY;
4546 return log_oom();
4547 }
4548
4549 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4550 if (r < 0) {
4551 *exit_status = EXIT_MEMORY;
4552 return log_oom();
4553 }
4554 }
4555
4556 accum_env = strv_env_merge(params->environment,
4557 our_env,
4558 joined_exec_search_path,
4559 pass_env,
4560 context->environment,
4561 params->files_env);
4562 if (!accum_env) {
4563 *exit_status = EXIT_MEMORY;
4564 return log_oom();
4565 }
4566 accum_env = strv_env_clean(accum_env);
4567
4568 (void) umask(context->umask);
4569
4570 r = setup_keyring(context, params, uid, gid);
4571 if (r < 0) {
4572 *exit_status = EXIT_KEYRING;
4573 return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m");
4574 }
4575
4576 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4577 * from it. */
4578 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4579
4580 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4581 * for it, and the kernel doesn't actually support ambient caps. */
4582 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4583
4584 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4585 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4586 * desired. */
4587 if (needs_ambient_hack)
4588 needs_setuid = false;
4589 else
4590 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4591
4592 uint64_t capability_ambient_set = context->capability_ambient_set;
4593
4594 if (needs_sandboxing) {
4595 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4596 * /sys being present. The actual MAC context application will happen later, as late as
4597 * possible, to avoid impacting our own code paths. */
4598
4599 #if HAVE_SELINUX
4600 use_selinux = mac_selinux_use();
4601 #endif
4602 #if ENABLE_SMACK
4603 use_smack = mac_smack_use();
4604 #endif
4605 #if HAVE_APPARMOR
4606 use_apparmor = mac_apparmor_use();
4607 #endif
4608 }
4609
4610 if (needs_sandboxing) {
4611 int which_failed;
4612
4613 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4614 * is set here. (See below.) */
4615
4616 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4617 if (r < 0) {
4618 *exit_status = EXIT_LIMITS;
4619 return log_exec_error_errno(context, params, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4620 }
4621 }
4622
4623 if (needs_setuid && context->pam_name && username) {
4624 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4625 * wins here. (See above.) */
4626
4627 /* All fds passed in the fds array will be closed in the pam child process. */
4628 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, params->fds, n_fds, params->exec_fd);
4629 if (r < 0) {
4630 *exit_status = EXIT_PAM;
4631 return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
4632 }
4633
4634 if (ambient_capabilities_supported()) {
4635 uint64_t ambient_after_pam;
4636
4637 /* PAM modules might have set some ambient caps. Query them here and merge them into
4638 * the caps we want to set in the end, so that we don't end up unsetting them. */
4639 r = capability_get_ambient(&ambient_after_pam);
4640 if (r < 0) {
4641 *exit_status = EXIT_CAPABILITIES;
4642 return log_exec_error_errno(context, params, r, "Failed to query ambient caps: %m");
4643 }
4644
4645 capability_ambient_set |= ambient_after_pam;
4646 }
4647
4648 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4649 if (ngids_after_pam < 0) {
4650 *exit_status = EXIT_GROUP;
4651 return log_exec_error_errno(context, params, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4652 }
4653 }
4654
4655 if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
4656 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4657 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4658 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4659
4660 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4661 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4662 * the actual requested operations fail (or silently continue). */
4663 if (r < 0 && context->private_users) {
4664 *exit_status = EXIT_USER;
4665 return log_exec_error_errno(context, params, r, "Failed to set up user namespacing for unprivileged user: %m");
4666 }
4667 if (r < 0)
4668 log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4669 else
4670 userns_set_up = true;
4671 }
4672
4673 if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4674
4675 /* Try to enable network namespacing if network namespacing is available and we have
4676 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4677 * new network namespace. And if we don't have that, then we could only create a network
4678 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4679 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
4680 r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
4681 if (ERRNO_IS_NEG_PRIVILEGE(r))
4682 log_exec_notice_errno(context, params, r,
4683 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4684 else if (r < 0) {
4685 *exit_status = EXIT_NETWORK;
4686 return log_exec_error_errno(context, params, r, "Failed to set up network namespacing: %m");
4687 }
4688 } else if (context->network_namespace_path) {
4689 *exit_status = EXIT_NETWORK;
4690 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4691 "NetworkNamespacePath= is not supported, refusing.");
4692 } else
4693 log_exec_notice(context, params, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4694 }
4695
4696 if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4697
4698 if (ns_type_supported(NAMESPACE_IPC)) {
4699 r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4700 if (r == -EPERM)
4701 log_exec_warning_errno(context, params, r,
4702 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4703 else if (r < 0) {
4704 *exit_status = EXIT_NAMESPACE;
4705 return log_exec_error_errno(context, params, r, "Failed to set up IPC namespacing: %m");
4706 }
4707 } else if (context->ipc_namespace_path) {
4708 *exit_status = EXIT_NAMESPACE;
4709 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4710 "IPCNamespacePath= is not supported, refusing.");
4711 } else
4712 log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4713 }
4714
4715 if (needs_mount_namespace) {
4716 _cleanup_free_ char *error_path = NULL;
4717
4718 r = apply_mount_namespace(command->flags,
4719 context,
4720 params,
4721 runtime,
4722 memory_pressure_path,
4723 needs_sandboxing,
4724 &error_path);
4725 if (r < 0) {
4726 *exit_status = EXIT_NAMESPACE;
4727 return log_exec_error_errno(context, params, r, "Failed to set up mount namespacing%s%s: %m",
4728 error_path ? ": " : "", strempty(error_path));
4729 }
4730 }
4731
4732 if (needs_sandboxing) {
4733 r = apply_protect_hostname(context, params, exit_status);
4734 if (r < 0)
4735 return r;
4736 }
4737
4738 if (context->memory_ksm >= 0)
4739 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm, 0, 0, 0) < 0) {
4740 if (ERRNO_IS_NOT_SUPPORTED(errno))
4741 log_exec_debug_errno(context,
4742 params,
4743 errno,
4744 "KSM support not available, ignoring.");
4745 else {
4746 *exit_status = EXIT_KSM;
4747 return log_exec_error_errno(context, params, errno, "Failed to set KSM: %m");
4748 }
4749 }
4750
4751 /* Drop groups as early as possible.
4752 * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root.
4753 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4754 if (needs_setuid) {
4755 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4756 int ngids_to_enforce = 0;
4757
4758 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4759 ngids,
4760 gids_after_pam,
4761 ngids_after_pam,
4762 &gids_to_enforce);
4763 if (ngids_to_enforce < 0) {
4764 *exit_status = EXIT_GROUP;
4765 return log_exec_error_errno(context, params,
4766 ngids_to_enforce,
4767 "Failed to merge group lists. Group membership might be incorrect: %m");
4768 }
4769
4770 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4771 if (r < 0) {
4772 *exit_status = EXIT_GROUP;
4773 return log_exec_error_errno(context, params, r, "Changing group credentials failed: %m");
4774 }
4775 }
4776
4777 /* If the user namespace was not set up above, try to do it now.
4778 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4779 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4780 * case of mount namespaces being less privileged when the mount point list is copied from a
4781 * different user namespace). */
4782
4783 if (needs_sandboxing && context->private_users && !userns_set_up) {
4784 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4785 if (r < 0) {
4786 *exit_status = EXIT_USER;
4787 return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m");
4788 }
4789 }
4790
4791 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4792 * shall execute. */
4793
4794 _cleanup_free_ char *executable = NULL;
4795 _cleanup_close_ int executable_fd = -EBADF;
4796 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4797 if (r < 0) {
4798 *exit_status = EXIT_EXEC;
4799 log_exec_struct_errno(context, params, LOG_NOTICE, r,
4800 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4801 LOG_EXEC_MESSAGE(params,
4802 "Unable to locate executable '%s': %m",
4803 command->path),
4804 "EXECUTABLE=%s", command->path);
4805 /* If the error will be ignored by manager, tune down the log level here. Missing executable
4806 * is very much expected in this case. */
4807 return r != -ENOMEM && FLAGS_SET(command->flags, EXEC_COMMAND_IGNORE_FAILURE) ? 1 : r;
4808 }
4809
4810 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &executable_fd);
4811 if (r < 0) {
4812 *exit_status = EXIT_FDS;
4813 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4814 }
4815
4816 #if HAVE_SELINUX
4817 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4818 int fd = -EBADF;
4819
4820 if (socket_fd >= 0)
4821 fd = socket_fd;
4822 else if (params->n_socket_fds == 1)
4823 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4824 * use context from that fd to compute the label. */
4825 fd = params->fds[0];
4826
4827 if (fd >= 0) {
4828 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4829 if (r < 0) {
4830 if (!context->selinux_context_ignore) {
4831 *exit_status = EXIT_SELINUX_CONTEXT;
4832 return log_exec_error_errno(context,
4833 params,
4834 r,
4835 "Failed to determine SELinux context: %m");
4836 }
4837 log_exec_debug_errno(context,
4838 params,
4839 r,
4840 "Failed to determine SELinux context, ignoring: %m");
4841 }
4842 }
4843 }
4844 #endif
4845
4846 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4847 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4848 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
4849 * execve(). But first, close the remaining sockets in the context objects. */
4850
4851 exec_runtime_close(runtime);
4852 exec_params_close(params);
4853
4854 r = close_all_fds(keep_fds, n_keep_fds);
4855 if (r >= 0)
4856 r = pack_fds(params->fds, n_fds);
4857 if (r >= 0)
4858 r = flag_fds(params->fds, n_socket_fds, n_fds, context->non_blocking);
4859 if (r < 0) {
4860 *exit_status = EXIT_FDS;
4861 return log_exec_error_errno(context, params, r, "Failed to adjust passed file descriptors: %m");
4862 }
4863
4864 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4865 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4866 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4867 * came this far. */
4868
4869 secure_bits = context->secure_bits;
4870
4871 if (needs_sandboxing) {
4872 uint64_t bset;
4873
4874 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4875 * (Note this is placed after the general resource limit initialization, see above, in order
4876 * to take precedence.) */
4877 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4878 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4879 *exit_status = EXIT_LIMITS;
4880 return log_exec_error_errno(context, params, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4881 }
4882 }
4883
4884 #if ENABLE_SMACK
4885 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4886 * process. This is the latest place before dropping capabilities. Other MAC contexts are set later. */
4887 if (use_smack && context->smack_process_label) {
4888 r = setup_smack(params, context, executable_fd);
4889 if (r < 0 && !context->smack_process_label_ignore) {
4890 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4891 return log_exec_error_errno(context, params, r, "Failed to set SMACK process label: %m");
4892 }
4893 }
4894 #endif
4895
4896 bset = context->capability_bounding_set;
4897 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4898 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4899 * instead of us doing that */
4900 if (needs_ambient_hack)
4901 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4902 (UINT64_C(1) << CAP_SETUID) |
4903 (UINT64_C(1) << CAP_SETGID);
4904
4905 #if HAVE_SECCOMP
4906 /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
4907 * keep the needed privileges to apply it even if we're not root. */
4908 if (needs_setuid &&
4909 uid_is_valid(uid) &&
4910 context_has_seccomp(context) &&
4911 seccomp_allows_drop_privileges(context)) {
4912 keep_seccomp_privileges = true;
4913
4914 if (prctl(PR_SET_KEEPCAPS, 1) < 0) {
4915 *exit_status = EXIT_USER;
4916 return log_exec_error_errno(context, params, errno, "Failed to enable keep capabilities flag: %m");
4917 }
4918
4919 /* Save the current bounding set so we can restore it after applying the seccomp
4920 * filter */
4921 saved_bset = bset;
4922 bset |= (UINT64_C(1) << CAP_SYS_ADMIN) |
4923 (UINT64_C(1) << CAP_SETPCAP);
4924 }
4925 #endif
4926
4927 if (!cap_test_all(bset)) {
4928 r = capability_bounding_set_drop(bset, /* right_now= */ false);
4929 if (r < 0) {
4930 *exit_status = EXIT_CAPABILITIES;
4931 return log_exec_error_errno(context, params, r, "Failed to drop capabilities: %m");
4932 }
4933 }
4934
4935 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4936 * keep-caps set.
4937 *
4938 * To be able to raise the ambient capabilities after setresuid() they have to be added to
4939 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
4940 * the ambient capabilities can be raised as they are present in the permitted and
4941 * inheritable set. However it is possible that someone wants to set ambient capabilities
4942 * without changing the user, so we also set the ambient capabilities here.
4943 *
4944 * The requested ambient capabilities are raised in the inheritable set if the second
4945 * argument is true. */
4946 if (!needs_ambient_hack) {
4947 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
4948 if (r < 0) {
4949 *exit_status = EXIT_CAPABILITIES;
4950 return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (before UID change): %m");
4951 }
4952 }
4953 }
4954
4955 /* chroot to root directory first, before we lose the ability to chroot */
4956 r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
4957 if (r < 0)
4958 return log_exec_error_errno(context, params, r, "Chrooting to the requested root directory failed: %m");
4959
4960 if (needs_setuid) {
4961 if (uid_is_valid(uid)) {
4962 r = enforce_user(context, uid, capability_ambient_set);
4963 if (r < 0) {
4964 *exit_status = EXIT_USER;
4965 return log_exec_error_errno(context, params, r, "Failed to change UID to " UID_FMT ": %m", uid);
4966 }
4967
4968 if (keep_seccomp_privileges) {
4969 if (!FLAGS_SET(capability_ambient_set, (UINT64_C(1) << CAP_SETUID))) {
4970 r = drop_capability(CAP_SETUID);
4971 if (r < 0) {
4972 *exit_status = EXIT_USER;
4973 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETUID: %m");
4974 }
4975 }
4976
4977 r = keep_capability(CAP_SYS_ADMIN);
4978 if (r < 0) {
4979 *exit_status = EXIT_USER;
4980 return log_exec_error_errno(context, params, r, "Failed to keep CAP_SYS_ADMIN: %m");
4981 }
4982
4983 r = keep_capability(CAP_SETPCAP);
4984 if (r < 0) {
4985 *exit_status = EXIT_USER;
4986 return log_exec_error_errno(context, params, r, "Failed to keep CAP_SETPCAP: %m");
4987 }
4988 }
4989
4990 if (!needs_ambient_hack && capability_ambient_set != 0) {
4991
4992 /* Raise the ambient capabilities after user change. */
4993 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
4994 if (r < 0) {
4995 *exit_status = EXIT_CAPABILITIES;
4996 return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (after UID change): %m");
4997 }
4998 }
4999 }
5000 }
5001
5002 /* Apply working directory here, because the working directory might be on NFS and only the user
5003 * running this service might have the correct privilege to change to the working directory. Also, it
5004 * is absolutely 💣 crucial 💣 we applied all mount namespacing rearrangements before this, so that
5005 * the cwd cannot be used to pin directories outside of the sandbox. */
5006 r = apply_working_directory(context, params, runtime, home, exit_status);
5007 if (r < 0)
5008 return log_exec_error_errno(context, params, r, "Changing to the requested working directory failed: %m");
5009
5010 if (needs_sandboxing) {
5011 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5012 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5013 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5014 * are restricted. */
5015
5016 #if HAVE_SELINUX
5017 if (use_selinux) {
5018 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5019
5020 if (exec_context) {
5021 r = setexeccon(exec_context);
5022 if (r < 0) {
5023 if (!context->selinux_context_ignore) {
5024 *exit_status = EXIT_SELINUX_CONTEXT;
5025 return log_exec_error_errno(context, params, r, "Failed to change SELinux context to %s: %m", exec_context);
5026 }
5027 log_exec_debug_errno(context,
5028 params,
5029 r,
5030 "Failed to change SELinux context to %s, ignoring: %m",
5031 exec_context);
5032 }
5033 }
5034 }
5035 #endif
5036
5037 #if HAVE_APPARMOR
5038 if (use_apparmor && context->apparmor_profile) {
5039 r = aa_change_onexec(context->apparmor_profile);
5040 if (r < 0 && !context->apparmor_profile_ignore) {
5041 *exit_status = EXIT_APPARMOR_PROFILE;
5042 return log_exec_error_errno(context,
5043 params,
5044 errno,
5045 "Failed to prepare AppArmor profile change to %s: %m",
5046 context->apparmor_profile);
5047 }
5048 }
5049 #endif
5050
5051 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5052 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5053 * requires CAP_SETPCAP. */
5054 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5055 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5056 * effective set here.
5057 *
5058 * The effective set is overwritten during execve() with the following values:
5059 *
5060 * - ambient set (for non-root processes)
5061 *
5062 * - (inheritable | bounding) set (for root processes)
5063 *
5064 * Hence there is no security impact in raising it in the effective set before execve().
5065 */
5066 r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
5067 if (r < 0) {
5068 *exit_status = EXIT_CAPABILITIES;
5069 return log_exec_error_errno(context, params, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5070 }
5071 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5072 *exit_status = EXIT_SECUREBITS;
5073 return log_exec_error_errno(context, params, errno, "Failed to set process secure bits: %m");
5074 }
5075 }
5076
5077 if (context_has_no_new_privileges(context))
5078 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5079 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5080 return log_exec_error_errno(context, params, errno, "Failed to disable new privileges: %m");
5081 }
5082
5083 #if HAVE_SECCOMP
5084 r = apply_address_families(context, params);
5085 if (r < 0) {
5086 *exit_status = EXIT_ADDRESS_FAMILIES;
5087 return log_exec_error_errno(context, params, r, "Failed to restrict address families: %m");
5088 }
5089
5090 r = apply_memory_deny_write_execute(context, params);
5091 if (r < 0) {
5092 *exit_status = EXIT_SECCOMP;
5093 return log_exec_error_errno(context, params, r, "Failed to disable writing to executable memory: %m");
5094 }
5095
5096 r = apply_restrict_realtime(context, params);
5097 if (r < 0) {
5098 *exit_status = EXIT_SECCOMP;
5099 return log_exec_error_errno(context, params, r, "Failed to apply realtime restrictions: %m");
5100 }
5101
5102 r = apply_restrict_suid_sgid(context, params);
5103 if (r < 0) {
5104 *exit_status = EXIT_SECCOMP;
5105 return log_exec_error_errno(context, params, r, "Failed to apply SUID/SGID restrictions: %m");
5106 }
5107
5108 r = apply_restrict_namespaces(context, params);
5109 if (r < 0) {
5110 *exit_status = EXIT_SECCOMP;
5111 return log_exec_error_errno(context, params, r, "Failed to apply namespace restrictions: %m");
5112 }
5113
5114 r = apply_protect_sysctl(context, params);
5115 if (r < 0) {
5116 *exit_status = EXIT_SECCOMP;
5117 return log_exec_error_errno(context, params, r, "Failed to apply sysctl restrictions: %m");
5118 }
5119
5120 r = apply_protect_kernel_modules(context, params);
5121 if (r < 0) {
5122 *exit_status = EXIT_SECCOMP;
5123 return log_exec_error_errno(context, params, r, "Failed to apply module loading restrictions: %m");
5124 }
5125
5126 r = apply_protect_kernel_logs(context, params);
5127 if (r < 0) {
5128 *exit_status = EXIT_SECCOMP;
5129 return log_exec_error_errno(context, params, r, "Failed to apply kernel log restrictions: %m");
5130 }
5131
5132 r = apply_protect_clock(context, params);
5133 if (r < 0) {
5134 *exit_status = EXIT_SECCOMP;
5135 return log_exec_error_errno(context, params, r, "Failed to apply clock restrictions: %m");
5136 }
5137
5138 r = apply_private_devices(context, params);
5139 if (r < 0) {
5140 *exit_status = EXIT_SECCOMP;
5141 return log_exec_error_errno(context, params, r, "Failed to set up private devices: %m");
5142 }
5143
5144 r = apply_syscall_archs(context, params);
5145 if (r < 0) {
5146 *exit_status = EXIT_SECCOMP;
5147 return log_exec_error_errno(context, params, r, "Failed to apply syscall architecture restrictions: %m");
5148 }
5149
5150 r = apply_lock_personality(context, params);
5151 if (r < 0) {
5152 *exit_status = EXIT_SECCOMP;
5153 return log_exec_error_errno(context, params, r, "Failed to lock personalities: %m");
5154 }
5155
5156 r = apply_syscall_log(context, params);
5157 if (r < 0) {
5158 *exit_status = EXIT_SECCOMP;
5159 return log_exec_error_errno(context, params, r, "Failed to apply system call log filters: %m");
5160 }
5161 #endif
5162
5163 #if HAVE_LIBBPF
5164 r = apply_restrict_filesystems(context, params);
5165 if (r < 0) {
5166 *exit_status = EXIT_BPF;
5167 return log_exec_error_errno(context, params, r, "Failed to restrict filesystems: %m");
5168 }
5169 #endif
5170
5171 #if HAVE_SECCOMP
5172 /* This really should remain as close to the execve() as possible, to make sure our own code is affected
5173 * by the filter as little as possible. */
5174 r = apply_syscall_filter(context, params, needs_ambient_hack);
5175 if (r < 0) {
5176 *exit_status = EXIT_SECCOMP;
5177 return log_exec_error_errno(context, params, r, "Failed to apply system call filters: %m");
5178 }
5179
5180 if (keep_seccomp_privileges) {
5181 /* Restore the capability bounding set with what's expected from the service + the
5182 * ambient capabilities hack */
5183 if (!cap_test_all(saved_bset)) {
5184 r = capability_bounding_set_drop(saved_bset, /* right_now= */ false);
5185 if (r < 0) {
5186 *exit_status = EXIT_CAPABILITIES;
5187 return log_exec_error_errno(context, params, r, "Failed to drop bset capabilities: %m");
5188 }
5189 }
5190
5191 /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break
5192 * applications that use it. */
5193 if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SYS_ADMIN))) {
5194 r = drop_capability(CAP_SYS_ADMIN);
5195 if (r < 0) {
5196 *exit_status = EXIT_USER;
5197 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SYS_ADMIN: %m");
5198 }
5199 }
5200
5201 /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break
5202 * applications that use it. */
5203 if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SETPCAP))) {
5204 r = drop_capability(CAP_SETPCAP);
5205 if (r < 0) {
5206 *exit_status = EXIT_USER;
5207 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETPCAP: %m");
5208 }
5209 }
5210
5211 if (prctl(PR_SET_KEEPCAPS, 0) < 0) {
5212 *exit_status = EXIT_USER;
5213 return log_exec_error_errno(context, params, errno, "Failed to drop keep capabilities flag: %m");
5214 }
5215 }
5216 #endif
5217
5218 }
5219
5220 if (!strv_isempty(context->unset_environment)) {
5221 char **ee = NULL;
5222
5223 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5224 if (!ee) {
5225 *exit_status = EXIT_MEMORY;
5226 return log_oom();
5227 }
5228
5229 strv_free_and_replace(accum_env, ee);
5230 }
5231
5232 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5233 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5234
5235 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5236 if (r < 0) {
5237 *exit_status = EXIT_MEMORY;
5238 return log_exec_error_errno(context,
5239 params,
5240 r,
5241 "Failed to replace environment variables: %m");
5242 }
5243 final_argv = replaced_argv;
5244
5245 if (!strv_isempty(unset_variables)) {
5246 _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5247 log_exec_warning(context,
5248 params,
5249 "Referenced but unset environment variable evaluates to an empty string: %s",
5250 strna(ju));
5251 }
5252
5253 if (!strv_isempty(bad_variables)) {
5254 _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5255 log_exec_warning(context,
5256 params,
5257 "Invalid environment variable name evaluates to an empty string: %s",
5258 strna(jb));
5259 }
5260 } else
5261 final_argv = command->argv;
5262
5263 log_command_line(context, params, "Executing", executable, final_argv);
5264
5265 if (params->exec_fd >= 0) {
5266 uint8_t hot = 1;
5267
5268 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5269 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5270
5271 if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
5272 *exit_status = EXIT_EXEC;
5273 return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m");
5274 }
5275 }
5276
5277 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5278
5279 if (params->exec_fd >= 0) {
5280 uint8_t hot = 0;
5281
5282 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5283 * that POLLHUP on it no longer means execve() succeeded. */
5284
5285 if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
5286 *exit_status = EXIT_EXEC;
5287 return log_exec_error_errno(context, params, errno, "Failed to disable exec_fd: %m");
5288 }
5289 }
5290
5291 *exit_status = EXIT_EXEC;
5292 return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable);
5293 }