]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/exec-invoke.c
exec-invoke: extend comment on placement of apply_working_directory() call
[thirdparty/systemd.git] / src / core / exec-invoke.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <sys/eventfd.h>
4 #include <sys/ioctl.h>
5 #include <sys/mount.h>
6 #include <sys/prctl.h>
7
8 #if HAVE_PAM
9 #include <security/pam_appl.h>
10 #include <security/pam_misc.h>
11 #endif
12
13 #if HAVE_APPARMOR
14 #include <sys/apparmor.h>
15 #endif
16
17 #include "sd-messages.h"
18
19 #if HAVE_APPARMOR
20 #include "apparmor-util.h"
21 #endif
22 #include "argv-util.h"
23 #include "barrier.h"
24 #include "bpf-dlopen.h"
25 #include "bpf-restrict-fs.h"
26 #include "btrfs-util.h"
27 #include "capability-util.h"
28 #include "cgroup-setup.h"
29 #include "chase.h"
30 #include "chattr-util.h"
31 #include "chown-recursive.h"
32 #include "copy.h"
33 #include "data-fd-util.h"
34 #include "env-util.h"
35 #include "escape.h"
36 #include "exec-credential.h"
37 #include "exec-invoke.h"
38 #include "execute.h"
39 #include "exit-status.h"
40 #include "fd-util.h"
41 #include "hexdecoct.h"
42 #include "io-util.h"
43 #include "iovec-util.h"
44 #include "missing_ioprio.h"
45 #include "missing_prctl.h"
46 #include "missing_securebits.h"
47 #include "missing_syscall.h"
48 #include "mkdir-label.h"
49 #include "proc-cmdline.h"
50 #include "process-util.h"
51 #include "psi-util.h"
52 #include "rlimit-util.h"
53 #include "seccomp-util.h"
54 #include "selinux-util.h"
55 #include "signal-util.h"
56 #include "smack-util.h"
57 #include "socket-util.h"
58 #include "string-table.h"
59 #include "strv.h"
60 #include "terminal-util.h"
61 #include "utmp-wtmp.h"
62 #include "vpick.h"
63
64 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
65 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
66
67 #define SNDBUF_SIZE (8*1024*1024)
68
/* Renumbers the passed file descriptors so that fds[i] ends up being fd number i+3, i.e. the fds are
 * laid out contiguously right after stdin/stdout/stderr.
 *
 * The array is rewritten in place: each entry is replaced by the duplicate that took its place, and the
 * previous descriptor is closed.
 *
 * Returns 0 on success, or a negative errno-style error if duplicating a descriptor fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int want = idx + 3, got;

                        /* Nothing to do if this one already sits in its slot */
                        if (fds[idx] == want)
                                continue;

                        /* F_DUPFD hands out the lowest free fd >= want, which is not necessarily 'want' */
                        got = fcntl(fds[idx], F_DUPFD, want);
                        if (got < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = got;

                        /* The slot was still occupied (by an fd we have yet to move): remember the
                         * first such index so that the next pass resumes from there. */
                        if (got != want && retry_at < 0)
                                retry_at = idx;
                }

                first = retry_at;
        } while (first >= 0);

        return 0;
}
108
/* Prepares the fds that are going to be passed to the child.
 *
 * For the first n_socket_fds entries O_NONBLOCK is set or cleared according to 'nonblock'; the other
 * entries keep their blocking mode. FD_CLOEXEC is dropped from all n_fds entries, since the whole point
 * is to pass them across exec.
 *
 * Returns 0 on success, or the first negative error reported by fd_nonblock()/fd_cloexec(). */
static int flag_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        assert(fds || n_fds == 0);

        for (size_t idx = 0; idx < n_fds; idx++) {
                int r;

                /* O_NONBLOCK handling is limited to the socket activation fds */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
141
142 static bool is_terminal_input(ExecInput i) {
143 return IN_SET(i,
144 EXEC_INPUT_TTY,
145 EXEC_INPUT_TTY_FORCE,
146 EXEC_INPUT_TTY_FAIL);
147 }
148
149 static bool is_terminal_output(ExecOutput o) {
150 return IN_SET(o,
151 EXEC_OUTPUT_TTY,
152 EXEC_OUTPUT_KMSG_AND_CONSOLE,
153 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
154 }
155
156 static bool is_kmsg_output(ExecOutput o) {
157 return IN_SET(o,
158 EXEC_OUTPUT_KMSG,
159 EXEC_OUTPUT_KMSG_AND_CONSOLE);
160 }
161
162 static bool exec_context_needs_term(const ExecContext *c) {
163 assert(c);
164
165 /* Return true if the execution context suggests we should set $TERM to something useful. */
166
167 if (is_terminal_input(c->std_input))
168 return true;
169
170 if (is_terminal_output(c->std_output))
171 return true;
172
173 if (is_terminal_output(c->std_error))
174 return true;
175
176 return !!c->tty_path;
177 }
178
179 static int open_null_as(int flags, int nfd) {
180 int fd;
181
182 assert(nfd >= 0);
183
184 fd = open("/dev/null", flags|O_NOCTTY);
185 if (fd < 0)
186 return -errno;
187
188 return move_fd(fd, nfd, false);
189 }
190
191 static int connect_journal_socket(
192 int fd,
193 const char *log_namespace,
194 uid_t uid,
195 gid_t gid) {
196
197 uid_t olduid = UID_INVALID;
198 gid_t oldgid = GID_INVALID;
199 const char *j;
200 int r;
201
202 j = log_namespace ?
203 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
204 "/run/systemd/journal/stdout";
205
206 if (gid_is_valid(gid)) {
207 oldgid = getgid();
208
209 if (setegid(gid) < 0)
210 return -errno;
211 }
212
213 if (uid_is_valid(uid)) {
214 olduid = getuid();
215
216 if (seteuid(uid) < 0) {
217 r = -errno;
218 goto restore_gid;
219 }
220 }
221
222 r = connect_unix_path(fd, AT_FDCWD, j);
223
224 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
225 an LSM interferes. */
226
227 if (uid_is_valid(uid))
228 (void) seteuid(olduid);
229
230 restore_gid:
231 if (gid_is_valid(gid))
232 (void) setegid(oldgid);
233
234 return r;
235 }
236
237 static int connect_logger_as(
238 const ExecContext *context,
239 const ExecParameters *params,
240 ExecOutput output,
241 const char *ident,
242 int nfd,
243 uid_t uid,
244 gid_t gid) {
245
246 _cleanup_close_ int fd = -EBADF;
247 int r;
248
249 assert(context);
250 assert(params);
251 assert(output < _EXEC_OUTPUT_MAX);
252 assert(ident);
253 assert(nfd >= 0);
254
255 fd = socket(AF_UNIX, SOCK_STREAM, 0);
256 if (fd < 0)
257 return -errno;
258
259 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
260 if (r < 0)
261 return r;
262
263 if (shutdown(fd, SHUT_RD) < 0)
264 return -errno;
265
266 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
267
268 if (dprintf(fd,
269 "%s\n"
270 "%s\n"
271 "%i\n"
272 "%i\n"
273 "%i\n"
274 "%i\n"
275 "%i\n",
276 context->syslog_identifier ?: ident,
277 params->flags & EXEC_PASS_LOG_UNIT ? params->unit_id : "",
278 context->syslog_priority,
279 !!context->syslog_level_prefix,
280 false,
281 is_kmsg_output(output),
282 is_terminal_output(output)) < 0)
283 return -errno;
284
285 return move_fd(TAKE_FD(fd), nfd, false);
286 }
287
288 static int open_terminal_as(const char *path, int flags, int nfd) {
289 int fd;
290
291 assert(path);
292 assert(nfd >= 0);
293
294 fd = open_terminal(path, flags | O_NOCTTY);
295 if (fd < 0)
296 return fd;
297
298 return move_fd(fd, nfd, false);
299 }
300
301 static int acquire_path(const char *path, int flags, mode_t mode) {
302 _cleanup_close_ int fd = -EBADF;
303 int r;
304
305 assert(path);
306
307 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
308 flags |= O_CREAT;
309
310 fd = open(path, flags|O_NOCTTY, mode);
311 if (fd >= 0)
312 return TAKE_FD(fd);
313
314 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
315 return -errno;
316
317 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
318
319 fd = socket(AF_UNIX, SOCK_STREAM, 0);
320 if (fd < 0)
321 return -errno;
322
323 r = connect_unix_path(fd, AT_FDCWD, path);
324 if (IN_SET(r, -ENOTSOCK, -EINVAL))
325 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
326 * wasn't an AF_UNIX socket after all */
327 return -ENXIO;
328 if (r < 0)
329 return r;
330
331 if ((flags & O_ACCMODE) == O_RDONLY)
332 r = shutdown(fd, SHUT_WR);
333 else if ((flags & O_ACCMODE) == O_WRONLY)
334 r = shutdown(fd, SHUT_RD);
335 else
336 r = 0;
337 if (r < 0)
338 return -errno;
339
340 return TAKE_FD(fd);
341 }
342
343 static int fixup_input(
344 const ExecContext *context,
345 int socket_fd,
346 bool apply_tty_stdin) {
347
348 ExecInput std_input;
349
350 assert(context);
351
352 std_input = context->std_input;
353
354 if (is_terminal_input(std_input) && !apply_tty_stdin)
355 return EXEC_INPUT_NULL;
356
357 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
358 return EXEC_INPUT_NULL;
359
360 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
361 return EXEC_INPUT_NULL;
362
363 return std_input;
364 }
365
366 static int fixup_output(ExecOutput output, int socket_fd) {
367
368 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
369 return EXEC_OUTPUT_INHERIT;
370
371 return output;
372 }
373
374 static int setup_input(
375 const ExecContext *context,
376 const ExecParameters *params,
377 int socket_fd,
378 const int named_iofds[static 3]) {
379
380 ExecInput i;
381 int r;
382
383 assert(context);
384 assert(params);
385 assert(named_iofds);
386
387 if (params->stdin_fd >= 0) {
388 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
389 return -errno;
390
391 /* Try to make this the controlling tty, if it is a tty, and reset it */
392 if (isatty(STDIN_FILENO)) {
393 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
394
395 if (context->tty_reset)
396 (void) reset_terminal_fd(STDIN_FILENO, /* switch_to_text= */ true);
397
398 (void) exec_context_apply_tty_size(context, STDIN_FILENO, /* tty_path= */ NULL);
399 }
400
401 return STDIN_FILENO;
402 }
403
404 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
405
406 switch (i) {
407
408 case EXEC_INPUT_NULL:
409 return open_null_as(O_RDONLY, STDIN_FILENO);
410
411 case EXEC_INPUT_TTY:
412 case EXEC_INPUT_TTY_FORCE:
413 case EXEC_INPUT_TTY_FAIL: {
414 _cleanup_close_ int tty_fd = -EBADF;
415 const char *tty_path;
416
417 tty_path = ASSERT_PTR(exec_context_tty_path(context));
418
419 tty_fd = acquire_terminal(tty_path,
420 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
421 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
422 ACQUIRE_TERMINAL_WAIT,
423 USEC_INFINITY);
424 if (tty_fd < 0)
425 return tty_fd;
426
427 r = exec_context_apply_tty_size(context, tty_fd, tty_path);
428 if (r < 0)
429 return r;
430
431 r = move_fd(tty_fd, STDIN_FILENO, /* cloexec= */ false);
432 if (r < 0)
433 return r;
434
435 TAKE_FD(tty_fd);
436 return r;
437 }
438
439 case EXEC_INPUT_SOCKET:
440 assert(socket_fd >= 0);
441
442 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
443
444 case EXEC_INPUT_NAMED_FD:
445 assert(named_iofds[STDIN_FILENO] >= 0);
446
447 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
448 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
449
450 case EXEC_INPUT_DATA: {
451 int fd;
452
453 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
454 if (fd < 0)
455 return fd;
456
457 return move_fd(fd, STDIN_FILENO, false);
458 }
459
460 case EXEC_INPUT_FILE: {
461 bool rw;
462 int fd;
463
464 assert(context->stdio_file[STDIN_FILENO]);
465
466 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
467 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
468
469 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
470 if (fd < 0)
471 return fd;
472
473 return move_fd(fd, STDIN_FILENO, false);
474 }
475
476 default:
477 assert_not_reached();
478 }
479 }
480
481 static bool can_inherit_stderr_from_stdout(
482 const ExecContext *context,
483 ExecOutput o,
484 ExecOutput e) {
485
486 assert(context);
487
488 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
489 * stderr fd */
490
491 if (e == EXEC_OUTPUT_INHERIT)
492 return true;
493 if (e != o)
494 return false;
495
496 if (e == EXEC_OUTPUT_NAMED_FD)
497 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
498
499 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
500 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
501
502 return true;
503 }
504
505 static int setup_output(
506 const ExecContext *context,
507 const ExecParameters *params,
508 int fileno,
509 int socket_fd,
510 const int named_iofds[static 3],
511 const char *ident,
512 uid_t uid,
513 gid_t gid,
514 dev_t *journal_stream_dev,
515 ino_t *journal_stream_ino) {
516
517 ExecOutput o;
518 ExecInput i;
519 int r;
520
521 assert(context);
522 assert(params);
523 assert(ident);
524 assert(journal_stream_dev);
525 assert(journal_stream_ino);
526
527 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
528
529 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
530 return -errno;
531
532 return STDOUT_FILENO;
533 }
534
535 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
536 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
537 return -errno;
538
539 return STDERR_FILENO;
540 }
541
542 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
543 o = fixup_output(context->std_output, socket_fd);
544
545 if (fileno == STDERR_FILENO) {
546 ExecOutput e;
547 e = fixup_output(context->std_error, socket_fd);
548
549 /* This expects the input and output are already set up */
550
551 /* Don't change the stderr file descriptor if we inherit all
552 * the way and are not on a tty */
553 if (e == EXEC_OUTPUT_INHERIT &&
554 o == EXEC_OUTPUT_INHERIT &&
555 i == EXEC_INPUT_NULL &&
556 !is_terminal_input(context->std_input) &&
557 getppid() != 1)
558 return fileno;
559
560 /* Duplicate from stdout if possible */
561 if (can_inherit_stderr_from_stdout(context, o, e))
562 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
563
564 o = e;
565
566 } else if (o == EXEC_OUTPUT_INHERIT) {
567 /* If input got downgraded, inherit the original value */
568 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
569 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
570
571 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
572 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
573 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
574
575 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
576 if (getppid() != 1)
577 return fileno;
578
579 /* We need to open /dev/null here anew, to get the right access mode. */
580 return open_null_as(O_WRONLY, fileno);
581 }
582
583 switch (o) {
584
585 case EXEC_OUTPUT_NULL:
586 return open_null_as(O_WRONLY, fileno);
587
588 case EXEC_OUTPUT_TTY:
589 if (is_terminal_input(i))
590 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
591
592 /* We don't reset the terminal if this is just about output */
593 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
594
595 case EXEC_OUTPUT_KMSG:
596 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
597 case EXEC_OUTPUT_JOURNAL:
598 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
599 r = connect_logger_as(context, params, o, ident, fileno, uid, gid);
600 if (r < 0) {
601 log_exec_warning_errno(context,
602 params,
603 r,
604 "Failed to connect %s to the journal socket, ignoring: %m",
605 fileno == STDOUT_FILENO ? "stdout" : "stderr");
606 r = open_null_as(O_WRONLY, fileno);
607 } else {
608 struct stat st;
609
610 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
611 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
612 * services to detect whether they are connected to the journal or not.
613 *
614 * If both stdout and stderr are connected to a stream then let's make sure to store the data
615 * about STDERR as that's usually the best way to do logging. */
616
617 if (fstat(fileno, &st) >= 0 &&
618 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
619 *journal_stream_dev = st.st_dev;
620 *journal_stream_ino = st.st_ino;
621 }
622 }
623 return r;
624
625 case EXEC_OUTPUT_SOCKET:
626 assert(socket_fd >= 0);
627
628 return RET_NERRNO(dup2(socket_fd, fileno));
629
630 case EXEC_OUTPUT_NAMED_FD:
631 assert(named_iofds[fileno] >= 0);
632
633 (void) fd_nonblock(named_iofds[fileno], false);
634 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
635
636 case EXEC_OUTPUT_FILE:
637 case EXEC_OUTPUT_FILE_APPEND:
638 case EXEC_OUTPUT_FILE_TRUNCATE: {
639 bool rw;
640 int fd, flags;
641
642 assert(context->stdio_file[fileno]);
643
644 rw = context->std_input == EXEC_INPUT_FILE &&
645 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
646
647 if (rw)
648 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
649
650 flags = O_WRONLY;
651 if (o == EXEC_OUTPUT_FILE_APPEND)
652 flags |= O_APPEND;
653 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
654 flags |= O_TRUNC;
655
656 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
657 if (fd < 0)
658 return fd;
659
660 return move_fd(fd, fileno, 0);
661 }
662
663 default:
664 assert_not_reached();
665 }
666 }
667
668 static int chown_terminal(int fd, uid_t uid) {
669 int r;
670
671 assert(fd >= 0);
672
673 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
674 if (!isatty_safe(fd))
675 return 0;
676
677 /* This might fail. What matters are the results. */
678 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
679 if (r < 0)
680 return r;
681
682 return 1;
683 }
684
685 static int setup_confirm_stdio(
686 const ExecContext *context,
687 const char *vc,
688 int *ret_saved_stdin,
689 int *ret_saved_stdout) {
690
691 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
692 int r;
693
694 assert(ret_saved_stdin);
695 assert(ret_saved_stdout);
696
697 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
698 if (saved_stdin < 0)
699 return -errno;
700
701 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
702 if (saved_stdout < 0)
703 return -errno;
704
705 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
706 if (fd < 0)
707 return fd;
708
709 r = chown_terminal(fd, getuid());
710 if (r < 0)
711 return r;
712
713 r = reset_terminal_fd(fd, /* switch_to_text= */ true);
714 if (r < 0)
715 return r;
716
717 r = exec_context_apply_tty_size(context, fd, vc);
718 if (r < 0)
719 return r;
720
721 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
722 TAKE_FD(fd);
723 if (r < 0)
724 return r;
725
726 *ret_saved_stdin = TAKE_FD(saved_stdin);
727 *ret_saved_stdout = TAKE_FD(saved_stdout);
728 return 0;
729 }
730
/* Writes a message to 'fd' explaining why no confirmation could be obtained for 'unit_id' and that a
 * positive response is assumed. 'err' is the negative errno-style error that occurred; timeouts get a
 * message of their own. */
static void write_confirm_error_fd(int err, int fd, const char *unit_id) {
        assert(err < 0);
        assert(unit_id);

        if (err != -ETIMEDOUT) {
                /* %m formats the current errno */
                errno = -err;
                dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", unit_id);
                return;
        }

        dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", unit_id);
}
742
743 static void write_confirm_error(int err, const char *vc, const char *unit_id) {
744 _cleanup_close_ int fd = -EBADF;
745
746 assert(vc);
747
748 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
749 if (fd < 0)
750 return;
751
752 write_confirm_error_fd(err, fd, unit_id);
753 }
754
/* Counterpart of setup_confirm_stdio(): releases the terminal, puts the saved stdin/stdout back in
 * place (where a saved fd is available) and closes the saved fds. Both saved fds are always closed and
 * invalidated, also on failure.
 *
 * Returns 0 on success. If restoring fails, the negative errno of the last failing dup2() is
 * returned. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
776
/* Possible outcomes of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* 'f': don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* 's': don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* 'y': execute the command */
};
782
/* Returns true if the flag file /run/systemd/confirm_spawn_disabled exists. */
static bool confirm_spawn_disabled(void) {
        if (access("/run/systemd/confirm_spawn_disabled", F_OK) < 0)
                return false;

        return true;
}
786
787 static int ask_for_confirmation(const ExecContext *context, const ExecParameters *params, const char *cmdline) {
788 int saved_stdout = -1, saved_stdin = -1, r;
789 _cleanup_free_ char *e = NULL;
790 char c;
791
792 assert(context);
793 assert(params);
794
795 /* For any internal errors, assume a positive response. */
796 r = setup_confirm_stdio(context, params->confirm_spawn, &saved_stdin, &saved_stdout);
797 if (r < 0) {
798 write_confirm_error(r, params->confirm_spawn, params->unit_id);
799 return CONFIRM_EXECUTE;
800 }
801
802 /* confirm_spawn might have been disabled while we were sleeping. */
803 if (!params->confirm_spawn || confirm_spawn_disabled()) {
804 r = 1;
805 goto restore_stdio;
806 }
807
808 e = ellipsize(cmdline, 60, 100);
809 if (!e) {
810 log_oom();
811 r = CONFIRM_EXECUTE;
812 goto restore_stdio;
813 }
814
815 for (;;) {
816 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
817 if (r < 0) {
818 write_confirm_error_fd(r, STDOUT_FILENO, params->unit_id);
819 r = CONFIRM_EXECUTE;
820 goto restore_stdio;
821 }
822
823 switch (c) {
824 case 'c':
825 printf("Resuming normal execution.\n");
826 manager_disable_confirm_spawn();
827 r = 1;
828 break;
829 case 'D':
830 printf(" Unit: %s\n",
831 params->unit_id);
832 exec_context_dump(context, stdout, " ");
833 exec_params_dump(params, stdout, " ");
834 continue; /* ask again */
835 case 'f':
836 printf("Failing execution.\n");
837 r = CONFIRM_PRETEND_FAILURE;
838 break;
839 case 'h':
840 printf(" c - continue, proceed without asking anymore\n"
841 " D - dump, show the state of the unit\n"
842 " f - fail, don't execute the command and pretend it failed\n"
843 " h - help\n"
844 " i - info, show a short summary of the unit\n"
845 " j - jobs, show jobs that are in progress\n"
846 " s - skip, don't execute the command and pretend it succeeded\n"
847 " y - yes, execute the command\n");
848 continue; /* ask again */
849 case 'i':
850 printf(" Unit: %s\n"
851 " Command: %s\n",
852 params->unit_id, cmdline);
853 continue; /* ask again */
854 case 'j':
855 if (sigqueue(getppid(),
856 SIGRTMIN+18,
857 (const union sigval) { .sival_int = MANAGER_SIGNAL_COMMAND_DUMP_JOBS }) < 0)
858 return -errno;
859
860 continue; /* ask again */
861 case 'n':
862 /* 'n' was removed in favor of 'f'. */
863 printf("Didn't understand 'n', did you mean 'f'?\n");
864 continue; /* ask again */
865 case 's':
866 printf("Skipping execution.\n");
867 r = CONFIRM_PRETEND_SUCCESS;
868 break;
869 case 'y':
870 r = CONFIRM_EXECUTE;
871 break;
872 default:
873 assert_not_reached();
874 }
875 break;
876 }
877
878 restore_stdio:
879 restore_confirm_stdio(&saved_stdin, &saved_stdout);
880 return r;
881 }
882
883 static int get_fixed_user(
884 const char *user_or_uid,
885 const char **ret_username,
886 uid_t *ret_uid,
887 gid_t *ret_gid,
888 const char **ret_home,
889 const char **ret_shell) {
890
891 int r;
892
893 assert(user_or_uid);
894 assert(ret_username);
895
896 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
897 * (i.e. are "/" or "/bin/nologin"). */
898
899 r = get_user_creds(&user_or_uid, ret_uid, ret_gid, ret_home, ret_shell, USER_CREDS_CLEAN);
900 if (r < 0)
901 return r;
902
903 /* user_or_uid is normalized by get_user_creds to username */
904 *ret_username = user_or_uid;
905
906 return 0;
907 }
908
/* Resolves the group specified by name or numeric GID.
 *
 * On success *ret_groupname is set to the group name as normalized by get_group_creds(), and the GID is
 * returned in *ret_gid.
 *
 * Returns 0 on success, or the negative error of get_group_creds(). */
static int get_fixed_group(
                const char *group_or_gid,
                const char **ret_groupname,
                gid_t *ret_gid) {

        const char *name = group_or_gid;
        int r;

        assert(group_or_gid);
        assert(ret_groupname);

        r = get_group_creds(&name, ret_gid, /* flags = */ 0);
        if (r < 0)
                return r;

        /* get_group_creds() has replaced 'name' by the normalized group name */
        *ret_groupname = name;
        return 0;
}
928
929 static int get_supplementary_groups(const ExecContext *c, const char *user,
930 const char *group, gid_t gid,
931 gid_t **supplementary_gids, int *ngids) {
932 int r, k = 0;
933 int ngroups_max;
934 bool keep_groups = false;
935 gid_t *groups = NULL;
936 _cleanup_free_ gid_t *l_gids = NULL;
937
938 assert(c);
939
940 /*
941 * If user is given, then lookup GID and supplementary groups list.
942 * We avoid NSS lookups for gid=0. Also we have to initialize groups
943 * here and as early as possible so we keep the list of supplementary
944 * groups of the caller.
945 */
946 if (user && gid_is_valid(gid) && gid != 0) {
947 /* First step, initialize groups from /etc/groups */
948 if (initgroups(user, gid) < 0)
949 return -errno;
950
951 keep_groups = true;
952 }
953
954 if (strv_isempty(c->supplementary_groups))
955 return 0;
956
957 /*
958 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
959 * be positive, otherwise fail.
960 */
961 errno = 0;
962 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
963 if (ngroups_max <= 0)
964 return errno_or_else(EOPNOTSUPP);
965
966 l_gids = new(gid_t, ngroups_max);
967 if (!l_gids)
968 return -ENOMEM;
969
970 if (keep_groups) {
971 /*
972 * Lookup the list of groups that the user belongs to, we
973 * avoid NSS lookups here too for gid=0.
974 */
975 k = ngroups_max;
976 if (getgrouplist(user, gid, l_gids, &k) < 0)
977 return -EINVAL;
978 } else
979 k = 0;
980
981 STRV_FOREACH(i, c->supplementary_groups) {
982 const char *g;
983
984 if (k >= ngroups_max)
985 return -E2BIG;
986
987 g = *i;
988 r = get_group_creds(&g, l_gids+k, 0);
989 if (r < 0)
990 return r;
991
992 k++;
993 }
994
995 /*
996 * Sets ngids to zero to drop all supplementary groups, happens
997 * when we are under root and SupplementaryGroups= is empty.
998 */
999 if (k == 0) {
1000 *ngids = 0;
1001 return 0;
1002 }
1003
1004 /* Otherwise get the final list of supplementary groups */
1005 groups = memdup(l_gids, sizeof(gid_t) * k);
1006 if (!groups)
1007 return -ENOMEM;
1008
1009 *supplementary_gids = groups;
1010 *ngids = k;
1011
1012 groups = NULL;
1013
1014 return 0;
1015 }
1016
/* Applies group credentials to the calling process: first the supplementary groups (only if the list
 * is not empty), then, if 'gid' is valid, the real, effective and saved GID.
 *
 * Returns 0 on success, or a negative errno-style error. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        if (ngids > 0) {
                int r;

                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1035
/* Updates the securebits of the calling process: the bits listed in 'mask' are cleared, then the bits
 * listed in 'bits' are set.
 *
 * Returns 0 if the securebits already had the requested value (nothing is written in that case), 1 if
 * they were changed, or a negative errno-style error. */
static int set_securebits(unsigned bits, unsigned mask) {
        unsigned wanted;
        int before;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        wanted = ((unsigned) before & ~mask) | bits;
        if (wanted == (unsigned) before)
                return 0;

        return prctl(PR_SET_SECUREBITS, wanted) < 0 ? -errno : 1;
}
1054
1055 static int enforce_user(
1056 const ExecContext *context,
1057 uid_t uid,
1058 uint64_t capability_ambient_set) {
1059 assert(context);
1060 int r;
1061
1062 if (!uid_is_valid(uid))
1063 return 0;
1064
1065 /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1066 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1067 * case. */
1068
1069 if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1070
1071 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1072 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1073 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1074 if (r < 0)
1075 return r;
1076 }
1077
1078 /* Second step: actually set the uids */
1079 if (setresuid(uid, uid, uid) < 0)
1080 return -errno;
1081
1082 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1083 * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1084 * outside of this call. */
1085 return 0;
1086 }
1087
1088 #if HAVE_PAM
1089
1090 static int null_conv(
1091 int num_msg,
1092 const struct pam_message **msg,
1093 struct pam_response **resp,
1094 void *appdata_ptr) {
1095
1096 /* We don't support conversations */
1097
1098 return PAM_CONV_ERR;
1099 }
1100
1101 static int pam_close_session_and_delete_credentials(pam_handle_t *handle, int flags) {
1102 int r, s;
1103
1104 assert(handle);
1105
1106 r = pam_close_session(handle, flags);
1107 if (r != PAM_SUCCESS)
1108 log_debug("pam_close_session() failed: %s", pam_strerror(handle, r));
1109
1110 s = pam_setcred(handle, PAM_DELETE_CRED | flags);
1111 if (s != PAM_SUCCESS)
1112 log_debug("pam_setcred(PAM_DELETE_CRED) failed: %s", pam_strerror(handle, s));
1113
1114 return r != PAM_SUCCESS ? r : s;
1115 }
1116
1117 #endif
1118
1119 static int setup_pam(
1120 const char *name,
1121 const char *user,
1122 uid_t uid,
1123 gid_t gid,
1124 const char *tty,
1125 char ***env, /* updated on success */
1126 const int fds[], size_t n_fds,
1127 int exec_fd) {
1128
1129 #if HAVE_PAM
1130
1131 static const struct pam_conv conv = {
1132 .conv = null_conv,
1133 .appdata_ptr = NULL
1134 };
1135
1136 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1137 _cleanup_strv_free_ char **e = NULL;
1138 pam_handle_t *handle = NULL;
1139 sigset_t old_ss;
1140 int pam_code = PAM_SUCCESS, r;
1141 bool close_session = false;
1142 pid_t parent_pid;
1143 int flags = 0;
1144
1145 assert(name);
1146 assert(user);
1147 assert(env);
1148
1149 /* We set up PAM in the parent process, then fork. The child
1150 * will then stay around until killed via PR_SET_PDEATHSIG or
1151 * systemd via the cgroup logic. It will then remove the PAM
1152 * session again. The parent process will exec() the actual
1153 * daemon. We do things this way to ensure that the main PID
1154 * of the daemon is the one we initially fork()ed. */
1155
1156 r = barrier_create(&barrier);
1157 if (r < 0)
1158 goto fail;
1159
1160 if (log_get_max_level() < LOG_DEBUG)
1161 flags |= PAM_SILENT;
1162
1163 pam_code = pam_start(name, user, &conv, &handle);
1164 if (pam_code != PAM_SUCCESS) {
1165 handle = NULL;
1166 goto fail;
1167 }
1168
1169 if (!tty) {
1170 _cleanup_free_ char *q = NULL;
1171
1172 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1173 * out if that's the case, and read the TTY off it. */
1174
1175 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1176 tty = strjoina("/dev/", q);
1177 }
1178
1179 if (tty) {
1180 pam_code = pam_set_item(handle, PAM_TTY, tty);
1181 if (pam_code != PAM_SUCCESS)
1182 goto fail;
1183 }
1184
1185 STRV_FOREACH(nv, *env) {
1186 pam_code = pam_putenv(handle, *nv);
1187 if (pam_code != PAM_SUCCESS)
1188 goto fail;
1189 }
1190
1191 pam_code = pam_acct_mgmt(handle, flags);
1192 if (pam_code != PAM_SUCCESS)
1193 goto fail;
1194
1195 pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
1196 if (pam_code != PAM_SUCCESS)
1197 log_debug("pam_setcred(PAM_ESTABLISH_CRED) failed, ignoring: %s", pam_strerror(handle, pam_code));
1198
1199 pam_code = pam_open_session(handle, flags);
1200 if (pam_code != PAM_SUCCESS)
1201 goto fail;
1202
1203 close_session = true;
1204
1205 e = pam_getenvlist(handle);
1206 if (!e) {
1207 pam_code = PAM_BUF_ERR;
1208 goto fail;
1209 }
1210
1211 /* Block SIGTERM, so that we know that it won't get lost in the child */
1212
1213 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1214
1215 parent_pid = getpid_cached();
1216
1217 r = safe_fork("(sd-pam)", 0, NULL);
1218 if (r < 0)
1219 goto fail;
1220 if (r == 0) {
1221 int ret = EXIT_PAM;
1222
1223 /* The child's job is to reset the PAM session on termination */
1224 barrier_set_role(&barrier, BARRIER_CHILD);
1225
1226 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1227 * those fds are open here that have been opened by PAM. */
1228 (void) close_many(fds, n_fds);
1229
1230 /* Also close the 'exec_fd' in the child, since the service manager waits for the EOF induced
1231 * by the execve() to wait for completion, and if we'd keep the fd open here in the child
1232 * we'd never signal completion. */
1233 exec_fd = safe_close(exec_fd);
1234
1235 /* Drop privileges - we don't need any to pam_close_session and this will make
1236 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
1237 * threads to fail to exit normally */
1238
1239 r = fully_set_uid_gid(uid, gid, /* supplementary_gids= */ NULL, /* n_supplementary_gids= */ 0);
1240 if (r < 0)
1241 log_warning_errno(r, "Failed to drop privileges in sd-pam: %m");
1242
1243 (void) ignore_signals(SIGPIPE);
1244
1245 /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1246 * otherwise the kernel will not allow unprivileged parents kill their privileged children
1247 * this way. We rely on the control groups kill logic to do the rest for us. */
1248 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1249 goto child_finish;
1250
1251 /* Tell the parent that our setup is done. This is especially important regarding dropping
1252 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1253 *
1254 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1255 (void) barrier_place(&barrier);
1256
1257 /* Check if our parent process might already have died? */
1258 if (getppid() == parent_pid) {
1259 sigset_t ss;
1260 int sig;
1261
1262 assert_se(sigemptyset(&ss) >= 0);
1263 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1264
1265 assert_se(sigwait(&ss, &sig) == 0);
1266 assert(sig == SIGTERM);
1267 }
1268
1269 /* If our parent died we'll end the session */
1270 if (getppid() != parent_pid) {
1271 pam_code = pam_close_session_and_delete_credentials(handle, flags);
1272 if (pam_code != PAM_SUCCESS)
1273 goto child_finish;
1274 }
1275
1276 ret = 0;
1277
1278 child_finish:
1279 /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1280 * know about this. See pam_end(3) */
1281 (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
1282 _exit(ret);
1283 }
1284
1285 barrier_set_role(&barrier, BARRIER_PARENT);
1286
1287 /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1288 * here. */
1289 handle = NULL;
1290
1291 /* Unblock SIGTERM again in the parent */
1292 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1293
1294 /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1295 * this fd around. */
1296 closelog();
1297
1298 /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1299 * recover. However, warn loudly if it happens. */
1300 if (!barrier_place_and_sync(&barrier))
1301 log_error("PAM initialization failed");
1302
1303 return strv_free_and_replace(*env, e);
1304
1305 fail:
1306 if (pam_code != PAM_SUCCESS) {
1307 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1308 r = -EPERM; /* PAM errors do not map to errno */
1309 } else
1310 log_error_errno(r, "PAM failed: %m");
1311
1312 if (handle) {
1313 if (close_session)
1314 pam_code = pam_close_session_and_delete_credentials(handle, flags);
1315
1316 (void) pam_end(handle, pam_code | flags);
1317 }
1318
1319 closelog();
1320 return r;
1321 #else
1322 return 0;
1323 #endif
1324 }
1325
1326 static void rename_process_from_path(const char *path) {
1327 _cleanup_free_ char *buf = NULL;
1328 const char *p;
1329
1330 assert(path);
1331
1332 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1333 * /bin/ps */
1334
1335 if (path_extract_filename(path, &buf) < 0) {
1336 rename_process("(...)");
1337 return;
1338 }
1339
1340 size_t l = strlen(buf);
1341 if (l > 8) {
1342 /* The end of the process name is usually more interesting, since the first bit might just be
1343 * "systemd-" */
1344 p = buf + l - 8;
1345 l = 8;
1346 } else
1347 p = buf;
1348
1349 char process_name[11];
1350 process_name[0] = '(';
1351 memcpy(process_name+1, p, l);
1352 process_name[1+l] = ')';
1353 process_name[1+l+1] = 0;
1354
1355 (void) rename_process(process_name);
1356 }
1357
1358 static bool context_has_address_families(const ExecContext *c) {
1359 assert(c);
1360
1361 return c->address_families_allow_list ||
1362 !set_isempty(c->address_families);
1363 }
1364
1365 static bool context_has_syscall_filters(const ExecContext *c) {
1366 assert(c);
1367
1368 return c->syscall_allow_list ||
1369 !hashmap_isempty(c->syscall_filter);
1370 }
1371
1372 static bool context_has_syscall_logs(const ExecContext *c) {
1373 assert(c);
1374
1375 return c->syscall_log_allow_list ||
1376 !hashmap_isempty(c->syscall_log);
1377 }
1378
1379 static bool context_has_seccomp(const ExecContext *c) {
1380 /* We need NNP if we have any form of seccomp and are unprivileged */
1381 return c->lock_personality ||
1382 c->memory_deny_write_execute ||
1383 c->private_devices ||
1384 c->protect_clock ||
1385 c->protect_hostname ||
1386 c->protect_kernel_tunables ||
1387 c->protect_kernel_modules ||
1388 c->protect_kernel_logs ||
1389 context_has_address_families(c) ||
1390 exec_context_restrict_namespaces_set(c) ||
1391 c->restrict_realtime ||
1392 c->restrict_suid_sgid ||
1393 !set_isempty(c->syscall_archs) ||
1394 context_has_syscall_filters(c) ||
1395 context_has_syscall_logs(c);
1396 }
1397
1398 static bool context_has_no_new_privileges(const ExecContext *c) {
1399 assert(c);
1400
1401 if (c->no_new_privileges)
1402 return true;
1403
1404 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1405 return false;
1406
1407 return context_has_seccomp(c);
1408 }
1409
1410 #if HAVE_SECCOMP
1411
1412 static bool seccomp_allows_drop_privileges(const ExecContext *c) {
1413 void *id, *val;
1414 bool has_capget = false, has_capset = false, has_prctl = false;
1415
1416 assert(c);
1417
1418 /* No syscall filter, we are allowed to drop privileges */
1419 if (hashmap_isempty(c->syscall_filter))
1420 return true;
1421
1422 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
1423 _cleanup_free_ char *name = NULL;
1424
1425 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
1426
1427 if (streq(name, "capget"))
1428 has_capget = true;
1429 else if (streq(name, "capset"))
1430 has_capset = true;
1431 else if (streq(name, "prctl"))
1432 has_prctl = true;
1433 }
1434
1435 if (c->syscall_allow_list)
1436 return has_capget && has_capset && has_prctl;
1437 else
1438 return !(has_capget || has_capset || has_prctl);
1439 }
1440
1441 static bool skip_seccomp_unavailable(const ExecContext *c, const ExecParameters *p, const char* msg) {
1442
1443 if (is_seccomp_available())
1444 return false;
1445
1446 log_exec_debug(c, p, "SECCOMP features not detected in the kernel, skipping %s", msg);
1447 return true;
1448 }
1449
1450 static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p, bool needs_ambient_hack) {
1451 uint32_t negative_action, default_action, action;
1452 int r;
1453
1454 assert(c);
1455 assert(p);
1456
1457 if (!context_has_syscall_filters(c))
1458 return 0;
1459
1460 if (skip_seccomp_unavailable(c, p, "SystemCallFilter="))
1461 return 0;
1462
1463 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1464
1465 if (c->syscall_allow_list) {
1466 default_action = negative_action;
1467 action = SCMP_ACT_ALLOW;
1468 } else {
1469 default_action = SCMP_ACT_ALLOW;
1470 action = negative_action;
1471 }
1472
1473 if (needs_ambient_hack) {
1474 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1475 if (r < 0)
1476 return r;
1477 }
1478
1479 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1480 }
1481
1482 static int apply_syscall_log(const ExecContext *c, const ExecParameters *p) {
1483 #ifdef SCMP_ACT_LOG
1484 uint32_t default_action, action;
1485 #endif
1486
1487 assert(c);
1488 assert(p);
1489
1490 if (!context_has_syscall_logs(c))
1491 return 0;
1492
1493 #ifdef SCMP_ACT_LOG
1494 if (skip_seccomp_unavailable(c, p, "SystemCallLog="))
1495 return 0;
1496
1497 if (c->syscall_log_allow_list) {
1498 /* Log nothing but the ones listed */
1499 default_action = SCMP_ACT_ALLOW;
1500 action = SCMP_ACT_LOG;
1501 } else {
1502 /* Log everything but the ones listed */
1503 default_action = SCMP_ACT_LOG;
1504 action = SCMP_ACT_ALLOW;
1505 }
1506
1507 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1508 #else
1509 /* old libseccomp */
1510 log_exec_debug(c, p, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1511 return 0;
1512 #endif
1513 }
1514
1515 static int apply_syscall_archs(const ExecContext *c, const ExecParameters *p) {
1516 assert(c);
1517 assert(p);
1518
1519 if (set_isempty(c->syscall_archs))
1520 return 0;
1521
1522 if (skip_seccomp_unavailable(c, p, "SystemCallArchitectures="))
1523 return 0;
1524
1525 return seccomp_restrict_archs(c->syscall_archs);
1526 }
1527
1528 static int apply_address_families(const ExecContext *c, const ExecParameters *p) {
1529 assert(c);
1530 assert(p);
1531
1532 if (!context_has_address_families(c))
1533 return 0;
1534
1535 if (skip_seccomp_unavailable(c, p, "RestrictAddressFamilies="))
1536 return 0;
1537
1538 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1539 }
1540
1541 static int apply_memory_deny_write_execute(const ExecContext *c, const ExecParameters *p) {
1542 int r;
1543
1544 assert(c);
1545 assert(p);
1546
1547 if (!c->memory_deny_write_execute)
1548 return 0;
1549
1550 /* use prctl() if kernel supports it (6.3) */
1551 r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1552 if (r == 0) {
1553 log_exec_debug(c, p, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1554 return 0;
1555 }
1556 if (r < 0 && errno != EINVAL)
1557 return log_exec_debug_errno(c,
1558 p,
1559 errno,
1560 "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1561 /* else use seccomp */
1562 log_exec_debug(c, p, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1563
1564 if (skip_seccomp_unavailable(c, p, "MemoryDenyWriteExecute="))
1565 return 0;
1566
1567 return seccomp_memory_deny_write_execute();
1568 }
1569
1570 static int apply_restrict_realtime(const ExecContext *c, const ExecParameters *p) {
1571 assert(c);
1572 assert(p);
1573
1574 if (!c->restrict_realtime)
1575 return 0;
1576
1577 if (skip_seccomp_unavailable(c, p, "RestrictRealtime="))
1578 return 0;
1579
1580 return seccomp_restrict_realtime();
1581 }
1582
1583 static int apply_restrict_suid_sgid(const ExecContext *c, const ExecParameters *p) {
1584 assert(c);
1585 assert(p);
1586
1587 if (!c->restrict_suid_sgid)
1588 return 0;
1589
1590 if (skip_seccomp_unavailable(c, p, "RestrictSUIDSGID="))
1591 return 0;
1592
1593 return seccomp_restrict_suid_sgid();
1594 }
1595
1596 static int apply_protect_sysctl(const ExecContext *c, const ExecParameters *p) {
1597 assert(c);
1598 assert(p);
1599
1600 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1601 * let's protect even those systems where this is left on in the kernel. */
1602
1603 if (!c->protect_kernel_tunables)
1604 return 0;
1605
1606 if (skip_seccomp_unavailable(c, p, "ProtectKernelTunables="))
1607 return 0;
1608
1609 return seccomp_protect_sysctl();
1610 }
1611
1612 static int apply_protect_kernel_modules(const ExecContext *c, const ExecParameters *p) {
1613 assert(c);
1614 assert(p);
1615
1616 /* Turn off module syscalls on ProtectKernelModules=yes */
1617
1618 if (!c->protect_kernel_modules)
1619 return 0;
1620
1621 if (skip_seccomp_unavailable(c, p, "ProtectKernelModules="))
1622 return 0;
1623
1624 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1625 }
1626
1627 static int apply_protect_kernel_logs(const ExecContext *c, const ExecParameters *p) {
1628 assert(c);
1629 assert(p);
1630
1631 if (!c->protect_kernel_logs)
1632 return 0;
1633
1634 if (skip_seccomp_unavailable(c, p, "ProtectKernelLogs="))
1635 return 0;
1636
1637 return seccomp_protect_syslog();
1638 }
1639
1640 static int apply_protect_clock(const ExecContext *c, const ExecParameters *p) {
1641 assert(c);
1642 assert(p);
1643
1644 if (!c->protect_clock)
1645 return 0;
1646
1647 if (skip_seccomp_unavailable(c, p, "ProtectClock="))
1648 return 0;
1649
1650 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1651 }
1652
1653 static int apply_private_devices(const ExecContext *c, const ExecParameters *p) {
1654 assert(c);
1655 assert(p);
1656
1657 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1658
1659 if (!c->private_devices)
1660 return 0;
1661
1662 if (skip_seccomp_unavailable(c, p, "PrivateDevices="))
1663 return 0;
1664
1665 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1666 }
1667
1668 static int apply_restrict_namespaces(const ExecContext *c, const ExecParameters *p) {
1669 assert(c);
1670 assert(p);
1671
1672 if (!exec_context_restrict_namespaces_set(c))
1673 return 0;
1674
1675 if (skip_seccomp_unavailable(c, p, "RestrictNamespaces="))
1676 return 0;
1677
1678 return seccomp_restrict_namespaces(c->restrict_namespaces);
1679 }
1680
1681 static int apply_lock_personality(const ExecContext *c, const ExecParameters *p) {
1682 unsigned long personality;
1683 int r;
1684
1685 assert(c);
1686 assert(p);
1687
1688 if (!c->lock_personality)
1689 return 0;
1690
1691 if (skip_seccomp_unavailable(c, p, "LockPersonality="))
1692 return 0;
1693
1694 personality = c->personality;
1695
1696 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1697 if (personality == PERSONALITY_INVALID) {
1698
1699 r = opinionated_personality(&personality);
1700 if (r < 0)
1701 return r;
1702 }
1703
1704 return seccomp_lock_personality(personality);
1705 }
1706
1707 #endif
1708
1709 #if HAVE_LIBBPF
1710 static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) {
1711 int r;
1712
1713 assert(c);
1714 assert(p);
1715
1716 if (!exec_context_restrict_filesystems_set(c))
1717 return 0;
1718
1719 if (p->bpf_restrict_fs_map_fd < 0) {
1720 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1721 log_exec_debug(c, p, "LSM BPF not supported, skipping RestrictFileSystems=");
1722 return 0;
1723 }
1724
1725 /* We are in a new binary, so dl-open again */
1726 r = dlopen_bpf();
1727 if (r < 0)
1728 return r;
1729
1730 return bpf_restrict_fs_update(c->restrict_filesystems, p->cgroup_id, p->bpf_restrict_fs_map_fd, c->restrict_filesystems_allow_list);
1731 }
1732 #endif
1733
1734 static int apply_protect_hostname(const ExecContext *c, const ExecParameters *p, int *ret_exit_status) {
1735 assert(c);
1736 assert(p);
1737
1738 if (!c->protect_hostname)
1739 return 0;
1740
1741 if (ns_type_supported(NAMESPACE_UTS)) {
1742 if (unshare(CLONE_NEWUTS) < 0) {
1743 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1744 *ret_exit_status = EXIT_NAMESPACE;
1745 return log_exec_error_errno(c,
1746 p,
1747 errno,
1748 "Failed to set up UTS namespacing: %m");
1749 }
1750
1751 log_exec_warning(c,
1752 p,
1753 "ProtectHostname=yes is configured, but UTS namespace setup is "
1754 "prohibited (container manager?), ignoring namespace setup.");
1755 }
1756 } else
1757 log_exec_warning(c,
1758 p,
1759 "ProtectHostname=yes is configured, but the kernel does not "
1760 "support UTS namespaces, ignoring namespace setup.");
1761
1762 #if HAVE_SECCOMP
1763 int r;
1764
1765 if (skip_seccomp_unavailable(c, p, "ProtectHostname="))
1766 return 0;
1767
1768 r = seccomp_protect_hostname();
1769 if (r < 0) {
1770 *ret_exit_status = EXIT_SECCOMP;
1771 return log_exec_error_errno(c, p, r, "Failed to apply hostname restrictions: %m");
1772 }
1773 #endif
1774
1775 return 0;
1776 }
1777
1778 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1779 assert(idle_pipe);
1780
1781 idle_pipe[1] = safe_close(idle_pipe[1]);
1782 idle_pipe[2] = safe_close(idle_pipe[2]);
1783
1784 if (idle_pipe[0] >= 0) {
1785 int r;
1786
1787 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1788
1789 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1790 ssize_t n;
1791
1792 /* Signal systemd that we are bored and want to continue. */
1793 n = write(idle_pipe[3], "x", 1);
1794 if (n > 0)
1795 /* Wait for systemd to react to the signal above. */
1796 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1797 }
1798
1799 idle_pipe[0] = safe_close(idle_pipe[0]);
1800
1801 }
1802
1803 idle_pipe[3] = safe_close(idle_pipe[3]);
1804 }
1805
1806 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1807
1808 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
1809 * the service payload in. */
1810 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1811 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
1812 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
1813 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
1814 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
1815 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
1816 };
1817
1818 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
1819
1820 static int build_environment(
1821 const ExecContext *c,
1822 const ExecParameters *p,
1823 const CGroupContext *cgroup_context,
1824 size_t n_fds,
1825 const char *home,
1826 const char *username,
1827 const char *shell,
1828 dev_t journal_stream_dev,
1829 ino_t journal_stream_ino,
1830 const char *memory_pressure_path,
1831 char ***ret) {
1832
1833 _cleanup_strv_free_ char **our_env = NULL;
1834 size_t n_env = 0;
1835 char *x;
1836 int r;
1837
1838 assert(c);
1839 assert(p);
1840 assert(ret);
1841
1842 #define N_ENV_VARS 19
1843 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1844 if (!our_env)
1845 return -ENOMEM;
1846
1847 if (n_fds > 0) {
1848 _cleanup_free_ char *joined = NULL;
1849
1850 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1851 return -ENOMEM;
1852 our_env[n_env++] = x;
1853
1854 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1855 return -ENOMEM;
1856 our_env[n_env++] = x;
1857
1858 joined = strv_join(p->fd_names, ":");
1859 if (!joined)
1860 return -ENOMEM;
1861
1862 x = strjoin("LISTEN_FDNAMES=", joined);
1863 if (!x)
1864 return -ENOMEM;
1865 our_env[n_env++] = x;
1866 }
1867
1868 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1869 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1870 return -ENOMEM;
1871 our_env[n_env++] = x;
1872
1873 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1874 return -ENOMEM;
1875 our_env[n_env++] = x;
1876 }
1877
1878 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1879 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1880 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1881 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1882 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1883 if (!x)
1884 return -ENOMEM;
1885 our_env[n_env++] = x;
1886 }
1887
1888 /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
1889 * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
1890 * really make much sense since we're not logged in. Hence we conditionalize the three based on
1891 * SetLoginEnvironment= switch. */
1892 if (!c->user && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
1893 r = get_fixed_user("root", &username, NULL, NULL, &home, &shell);
1894 if (r < 0)
1895 return log_exec_debug_errno(c,
1896 p,
1897 r,
1898 "Failed to determine user credentials for root: %m");
1899 }
1900
1901 bool set_user_login_env = exec_context_get_set_login_environment(c);
1902
1903 if (username) {
1904 x = strjoin("USER=", username);
1905 if (!x)
1906 return -ENOMEM;
1907 our_env[n_env++] = x;
1908
1909 if (set_user_login_env) {
1910 x = strjoin("LOGNAME=", username);
1911 if (!x)
1912 return -ENOMEM;
1913 our_env[n_env++] = x;
1914 }
1915 }
1916
1917 if (home && set_user_login_env) {
1918 x = strjoin("HOME=", home);
1919 if (!x)
1920 return -ENOMEM;
1921
1922 path_simplify(x + 5);
1923 our_env[n_env++] = x;
1924 }
1925
1926 if (shell && set_user_login_env) {
1927 x = strjoin("SHELL=", shell);
1928 if (!x)
1929 return -ENOMEM;
1930
1931 path_simplify(x + 6);
1932 our_env[n_env++] = x;
1933 }
1934
1935 if (!sd_id128_is_null(p->invocation_id)) {
1936 assert(p->invocation_id_string);
1937
1938 x = strjoin("INVOCATION_ID=", p->invocation_id_string);
1939 if (!x)
1940 return -ENOMEM;
1941
1942 our_env[n_env++] = x;
1943 }
1944
1945 if (exec_context_needs_term(c)) {
1946 _cleanup_free_ char *cmdline = NULL;
1947 const char *tty_path, *term = NULL;
1948
1949 tty_path = exec_context_tty_path(c);
1950
1951 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1952 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1953 * container manager passes to PID 1 ends up all the way in the console login shown. */
1954
1955 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1956 term = getenv("TERM");
1957 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1958 _cleanup_free_ char *key = NULL;
1959
1960 key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
1961 if (!key)
1962 return -ENOMEM;
1963
1964 r = proc_cmdline_get_key(key, 0, &cmdline);
1965 if (r < 0)
1966 log_exec_debug_errno(c,
1967 p,
1968 r,
1969 "Failed to read %s from kernel cmdline, ignoring: %m",
1970 key);
1971 else if (r > 0)
1972 term = cmdline;
1973 }
1974
1975 if (!term)
1976 term = default_term_for_tty(tty_path);
1977
1978 x = strjoin("TERM=", term);
1979 if (!x)
1980 return -ENOMEM;
1981 our_env[n_env++] = x;
1982 }
1983
1984 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1985 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1986 return -ENOMEM;
1987
1988 our_env[n_env++] = x;
1989 }
1990
1991 if (c->log_namespace) {
1992 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1993 if (!x)
1994 return -ENOMEM;
1995
1996 our_env[n_env++] = x;
1997 }
1998
1999 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2000 _cleanup_free_ char *joined = NULL;
2001 const char *n;
2002
2003 if (!p->prefix[t])
2004 continue;
2005
2006 if (c->directories[t].n_items == 0)
2007 continue;
2008
2009 n = exec_directory_env_name_to_string(t);
2010 if (!n)
2011 continue;
2012
2013 for (size_t i = 0; i < c->directories[t].n_items; i++) {
2014 _cleanup_free_ char *prefixed = NULL;
2015
2016 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2017 if (!prefixed)
2018 return -ENOMEM;
2019
2020 if (!strextend_with_separator(&joined, ":", prefixed))
2021 return -ENOMEM;
2022 }
2023
2024 x = strjoin(n, "=", joined);
2025 if (!x)
2026 return -ENOMEM;
2027
2028 our_env[n_env++] = x;
2029 }
2030
2031 _cleanup_free_ char *creds_dir = NULL;
2032 r = exec_context_get_credential_directory(c, p, p->unit_id, &creds_dir);
2033 if (r < 0)
2034 return r;
2035 if (r > 0) {
2036 x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
2037 if (!x)
2038 return -ENOMEM;
2039
2040 our_env[n_env++] = x;
2041 }
2042
2043 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2044 return -ENOMEM;
2045
2046 our_env[n_env++] = x;
2047
2048 if (memory_pressure_path) {
2049 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2050 if (!x)
2051 return -ENOMEM;
2052
2053 our_env[n_env++] = x;
2054
2055 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2056 _cleanup_free_ char *b = NULL, *e = NULL;
2057
2058 if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2059 MEMORY_PRESSURE_DEFAULT_TYPE,
2060 cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2061 CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2062 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2063 return -ENOMEM;
2064
2065 if (base64mem(b, strlen(b) + 1, &e) < 0)
2066 return -ENOMEM;
2067
2068 x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2069 if (!x)
2070 return -ENOMEM;
2071
2072 our_env[n_env++] = x;
2073 }
2074 }
2075
2076 assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2077 #undef N_ENV_VARS
2078
2079 *ret = TAKE_PTR(our_env);
2080
2081 return 0;
2082 }
2083
2084 static int build_pass_environment(const ExecContext *c, char ***ret) {
2085 _cleanup_strv_free_ char **pass_env = NULL;
2086 size_t n_env = 0;
2087
2088 STRV_FOREACH(i, c->pass_environment) {
2089 _cleanup_free_ char *x = NULL;
2090 char *v;
2091
2092 v = getenv(*i);
2093 if (!v)
2094 continue;
2095 x = strjoin(*i, "=", v);
2096 if (!x)
2097 return -ENOMEM;
2098
2099 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2100 return -ENOMEM;
2101
2102 pass_env[n_env++] = TAKE_PTR(x);
2103 pass_env[n_env] = NULL;
2104 }
2105
2106 *ret = TAKE_PTR(pass_env);
2107
2108 return 0;
2109 }
2110
2111 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2112 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2113 _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
2114 _cleanup_close_ int unshare_ready_fd = -EBADF;
2115 _cleanup_(sigkill_waitp) pid_t pid = 0;
2116 uint64_t c = 1;
2117 ssize_t n;
2118 int r;
2119
2120 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2121 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2122 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2123 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2124 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2125 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2126 * continues execution normally.
2127 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2128 * does not need CAP_SETUID to write the single line mapping to itself. */
2129
2130 /* Can only set up multiple mappings with CAP_SETUID. */
2131 if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2132 r = asprintf(&uid_map,
2133 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2134 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2135 ouid, ouid, uid, uid);
2136 else
2137 r = asprintf(&uid_map,
2138 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2139 ouid, ouid);
2140
2141 if (r < 0)
2142 return -ENOMEM;
2143
2144 /* Can only set up multiple mappings with CAP_SETGID. */
2145 if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2146 r = asprintf(&gid_map,
2147 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2148 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2149 ogid, ogid, gid, gid);
2150 else
2151 r = asprintf(&gid_map,
2152 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2153 ogid, ogid);
2154
2155 if (r < 0)
2156 return -ENOMEM;
2157
2158 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2159 * namespace. */
2160 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2161 if (unshare_ready_fd < 0)
2162 return -errno;
2163
2164 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2165 * failed. */
2166 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2167 return -errno;
2168
2169 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, &pid);
2170 if (r < 0)
2171 return r;
2172 if (r == 0) {
2173 _cleanup_close_ int fd = -EBADF;
2174 const char *a;
2175 pid_t ppid;
2176
2177 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2178 * here, after the parent opened its own user namespace. */
2179
2180 ppid = getppid();
2181 errno_pipe[0] = safe_close(errno_pipe[0]);
2182
2183 /* Wait until the parent unshared the user namespace */
2184 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2185 r = -errno;
2186 goto child_fail;
2187 }
2188
2189 /* Disable the setgroups() system call in the child user namespace, for good. */
2190 a = procfs_file_alloca(ppid, "setgroups");
2191 fd = open(a, O_WRONLY|O_CLOEXEC);
2192 if (fd < 0) {
2193 if (errno != ENOENT) {
2194 r = -errno;
2195 goto child_fail;
2196 }
2197
2198 /* If the file is missing the kernel is too old, let's continue anyway. */
2199 } else {
2200 if (write(fd, "deny\n", 5) < 0) {
2201 r = -errno;
2202 goto child_fail;
2203 }
2204
2205 fd = safe_close(fd);
2206 }
2207
2208 /* First write the GID map */
2209 a = procfs_file_alloca(ppid, "gid_map");
2210 fd = open(a, O_WRONLY|O_CLOEXEC);
2211 if (fd < 0) {
2212 r = -errno;
2213 goto child_fail;
2214 }
2215 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2216 r = -errno;
2217 goto child_fail;
2218 }
2219 fd = safe_close(fd);
2220
2221 /* The write the UID map */
2222 a = procfs_file_alloca(ppid, "uid_map");
2223 fd = open(a, O_WRONLY|O_CLOEXEC);
2224 if (fd < 0) {
2225 r = -errno;
2226 goto child_fail;
2227 }
2228 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2229 r = -errno;
2230 goto child_fail;
2231 }
2232
2233 _exit(EXIT_SUCCESS);
2234
2235 child_fail:
2236 (void) write(errno_pipe[1], &r, sizeof(r));
2237 _exit(EXIT_FAILURE);
2238 }
2239
2240 errno_pipe[1] = safe_close(errno_pipe[1]);
2241
2242 if (unshare(CLONE_NEWUSER) < 0)
2243 return -errno;
2244
2245 /* Let the child know that the namespace is ready now */
2246 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2247 return -errno;
2248
2249 /* Try to read an error code from the child */
2250 n = read(errno_pipe[0], &r, sizeof(r));
2251 if (n < 0)
2252 return -errno;
2253 if (n == sizeof(r)) { /* an error code was sent to us */
2254 if (r < 0)
2255 return r;
2256 return -EIO;
2257 }
2258 if (n != 0) /* on success we should have read 0 bytes */
2259 return -EIO;
2260
2261 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2262 if (r < 0)
2263 return r;
2264 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2265 return -EIO;
2266
2267 return 0;
2268 }
2269
2270 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2271 _cleanup_free_ char *src_abs = NULL;
2272 int r;
2273
2274 assert(source);
2275
2276 src_abs = path_join(root, source);
2277 if (!src_abs)
2278 return -ENOMEM;
2279
2280 STRV_FOREACH(dst, symlinks) {
2281 _cleanup_free_ char *dst_abs = NULL;
2282
2283 dst_abs = path_join(root, *dst);
2284 if (!dst_abs)
2285 return -ENOMEM;
2286
2287 r = mkdir_parents_label(dst_abs, 0755);
2288 if (r < 0)
2289 return r;
2290
2291 r = symlink_idempotent(src_abs, dst_abs, true);
2292 if (r < 0)
2293 return r;
2294 }
2295
2296 return 0;
2297 }
2298
2299 static int setup_exec_directory(
2300 const ExecContext *context,
2301 const ExecParameters *params,
2302 uid_t uid,
2303 gid_t gid,
2304 ExecDirectoryType type,
2305 bool needs_mount_namespace,
2306 int *exit_status) {
2307
2308 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2309 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2310 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2311 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2312 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2313 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2314 };
2315 int r;
2316
2317 assert(context);
2318 assert(params);
2319 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2320 assert(exit_status);
2321
2322 if (!params->prefix[type])
2323 return 0;
2324
2325 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2326 if (!uid_is_valid(uid))
2327 uid = 0;
2328 if (!gid_is_valid(gid))
2329 gid = 0;
2330 }
2331
2332 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2333 _cleanup_free_ char *p = NULL, *pp = NULL;
2334
2335 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2336 if (!p) {
2337 r = -ENOMEM;
2338 goto fail;
2339 }
2340
2341 r = mkdir_parents_label(p, 0755);
2342 if (r < 0)
2343 goto fail;
2344
2345 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2346
2347 /* If we are in user mode, and a configuration directory exists but a state directory
2348 * doesn't exist, then we likely are upgrading from an older systemd version that
2349 * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2350 * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2351 * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now
2352 * separated. If a service has both dirs configured but only the configuration dir
2353 * exists and the state dir does not, we assume we are looking at an update
2354 * situation. Hence, create a compatibility symlink, so that all expectations are
2355 * met.
2356 *
2357 * (We also do something similar with the log directory, which still doesn't exist in
2358 * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2359
2360 /* this assumes the state dir is always created before the configuration dir */
2361 assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2362 assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2363
2364 r = laccess(p, F_OK);
2365 if (r == -ENOENT) {
2366 _cleanup_free_ char *q = NULL;
2367
2368 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2369 * under the configuration hierarchy. */
2370
2371 if (type == EXEC_DIRECTORY_STATE)
2372 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2373 else if (type == EXEC_DIRECTORY_LOGS)
2374 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2375 else
2376 assert_not_reached();
2377 if (!q) {
2378 r = -ENOMEM;
2379 goto fail;
2380 }
2381
2382 r = laccess(q, F_OK);
2383 if (r >= 0) {
2384 /* It does exist! This hence looks like an update. Symlink the
2385 * configuration directory into the state directory. */
2386
2387 r = symlink_idempotent(q, p, /* make_relative= */ true);
2388 if (r < 0)
2389 goto fail;
2390
2391 log_exec_notice(context, params, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2392 continue;
2393 } else if (r != -ENOENT)
2394 log_exec_warning_errno(context, params, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2395
2396 } else if (r < 0)
2397 log_exec_warning_errno(context, params, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2398 }
2399
2400 if (exec_directory_is_private(context, type)) {
2401 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2402 * case we want to avoid leaving a directory around fully accessible that is owned by
2403 * a dynamic user whose UID is later on reused. To lock this down we use the same
2404 * trick used by container managers to prohibit host users to get access to files of
2405 * the same UID in containers: we place everything inside a directory that has an
2406 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2407 * for unprivileged host code. We then use fs namespacing to make this directory
2408 * permeable for the service itself.
2409 *
2410 * Specifically: for a service which wants a special directory "foo/" we first create
2411 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2412 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2413 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2414 * unprivileged host users can't look into it. Inside of the namespace of the unit
2415 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2416 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2417 * for the service and making sure it only gets access to the dirs it needs but no
2418 * others. Tricky? Yes, absolutely, but it works!
2419 *
2420 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2421 * to be owned by the service itself.
2422 *
2423 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2424 * for sharing files or sockets with other services. */
2425
2426 pp = path_join(params->prefix[type], "private");
2427 if (!pp) {
2428 r = -ENOMEM;
2429 goto fail;
2430 }
2431
2432 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2433 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2434 if (r < 0)
2435 goto fail;
2436
2437 if (!path_extend(&pp, context->directories[type].items[i].path)) {
2438 r = -ENOMEM;
2439 goto fail;
2440 }
2441
2442 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2443 r = mkdir_parents_label(pp, 0755);
2444 if (r < 0)
2445 goto fail;
2446
2447 if (is_dir(p, false) > 0 &&
2448 (laccess(pp, F_OK) == -ENOENT)) {
2449
2450 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2451 * it over. Most likely the service has been upgraded from one that didn't use
2452 * DynamicUser=1, to one that does. */
2453
2454 log_exec_info(context,
2455 params,
2456 "Found pre-existing public %s= directory %s, migrating to %s.\n"
2457 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2458 exec_directory_type_to_string(type), p, pp);
2459
2460 r = RET_NERRNO(rename(p, pp));
2461 if (r < 0)
2462 goto fail;
2463 } else {
2464 /* Otherwise, create the actual directory for the service */
2465
2466 r = mkdir_label(pp, context->directories[type].mode);
2467 if (r < 0 && r != -EEXIST)
2468 goto fail;
2469 }
2470
2471 if (!context->directories[type].items[i].only_create) {
2472 /* And link it up from the original place.
2473 * Notes
2474 * 1) If a mount namespace is going to be used, then this symlink remains on
2475 * the host, and a new one for the child namespace will be created later.
2476 * 2) It is not necessary to create this symlink when one of its parent
2477 * directories is specified and already created. E.g.
2478 * StateDirectory=foo foo/bar
2479 * In that case, the inode points to pp and p for "foo/bar" are the same:
2480 * pp = "/var/lib/private/foo/bar"
2481 * p = "/var/lib/foo/bar"
2482 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2483 * we do not need to create the symlink, but we cannot create the symlink.
2484 * See issue #24783. */
2485 r = symlink_idempotent(pp, p, true);
2486 if (r < 0)
2487 goto fail;
2488 }
2489
2490 } else {
2491 _cleanup_free_ char *target = NULL;
2492
2493 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2494 readlink_and_make_absolute(p, &target) >= 0) {
2495 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2496
2497 /* This already exists and is a symlink? Interesting. Maybe it's one created
2498 * by DynamicUser=1 (see above)?
2499 *
2500 * We do this for all directory types except for ConfigurationDirectory=,
2501 * since they all support the private/ symlink logic at least in some
2502 * configurations, see above. */
2503
2504 r = chase(target, NULL, 0, &target_resolved, NULL);
2505 if (r < 0)
2506 goto fail;
2507
2508 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2509 if (!q) {
2510 r = -ENOMEM;
2511 goto fail;
2512 }
2513
2514 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2515 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2516 if (r < 0)
2517 goto fail;
2518
2519 if (path_equal(q_resolved, target_resolved)) {
2520
2521 /* Hmm, apparently DynamicUser= was once turned on for this service,
2522 * but is no longer. Let's move the directory back up. */
2523
2524 log_exec_info(context,
2525 params,
2526 "Found pre-existing private %s= directory %s, migrating to %s.\n"
2527 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2528 exec_directory_type_to_string(type), q, p);
2529
2530 r = RET_NERRNO(unlink(p));
2531 if (r < 0)
2532 goto fail;
2533
2534 r = RET_NERRNO(rename(q, p));
2535 if (r < 0)
2536 goto fail;
2537 }
2538 }
2539
2540 r = mkdir_label(p, context->directories[type].mode);
2541 if (r < 0) {
2542 if (r != -EEXIST)
2543 goto fail;
2544
2545 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2546 struct stat st;
2547
2548 /* Don't change the owner/access mode of the configuration directory,
2549 * as in the common case it is not written to by a service, and shall
2550 * not be writable. */
2551
2552 r = RET_NERRNO(stat(p, &st));
2553 if (r < 0)
2554 goto fail;
2555
2556 /* Still complain if the access mode doesn't match */
2557 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2558 log_exec_warning(context,
2559 params,
2560 "%s \'%s\' already exists but the mode is different. "
2561 "(File system: %o %sMode: %o)",
2562 exec_directory_type_to_string(type), context->directories[type].items[i].path,
2563 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2564
2565 continue;
2566 }
2567 }
2568 }
2569
2570 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2571 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2572 * current UID/GID ownership.) */
2573 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2574 if (r < 0)
2575 goto fail;
2576
2577 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2578 * available to user code anyway */
2579 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2580 continue;
2581
2582 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2583 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2584 * assignments to exist. */
2585 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2586 if (r < 0)
2587 goto fail;
2588 }
2589
2590 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2591 * they are set up later, to allow configuring empty var/run/etc. */
2592 if (!needs_mount_namespace)
2593 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2594 r = create_many_symlinks(params->prefix[type],
2595 context->directories[type].items[i].path,
2596 context->directories[type].items[i].symlinks);
2597 if (r < 0)
2598 goto fail;
2599 }
2600
2601 return 0;
2602
2603 fail:
2604 *exit_status = exit_status_table[type];
2605 return r;
2606 }
2607
#if ENABLE_SMACK
/* Applies a SMACK label to the current process (PID 0 is passed to mac_smack_apply_pid()).
 *
 * If the context carries an explicit process label, that one is applied. Otherwise, if a fallback
 * label is configured, the SMACK_ATTR_EXEC label read from the executable fd is applied, or the
 * fallback label if the executable carries none. If neither is configured nothing is done.
 *
 * Returns 0 on success, a negative error code on failure. */
static int setup_smack(
                const ExecParameters *params,
                const ExecContext *context,
                int executable_fd) {

        _cleanup_free_ char *exec_label = NULL;
        int r;

        assert(params);
        assert(executable_fd >= 0);

        /* An explicitly configured label always wins */
        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

        if (!params->fallback_smack_process_label)
                return 0;

        /* A missing xattr is not an error, the fallback label is used in that case */
        r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
        if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                return r;

        r = mac_smack_apply_pid(0, exec_label ?: params->fallback_smack_process_label);
        return r < 0 ? r : 0;
}
#endif
2637
2638 static int compile_bind_mounts(
2639 const ExecContext *context,
2640 const ExecParameters *params,
2641 BindMount **ret_bind_mounts,
2642 size_t *ret_n_bind_mounts,
2643 char ***ret_empty_directories) {
2644
2645 _cleanup_strv_free_ char **empty_directories = NULL;
2646 BindMount *bind_mounts = NULL;
2647 size_t n, h = 0;
2648 int r;
2649
2650 assert(context);
2651 assert(params);
2652 assert(ret_bind_mounts);
2653 assert(ret_n_bind_mounts);
2654 assert(ret_empty_directories);
2655
2656 CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2657
2658 n = context->n_bind_mounts;
2659 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2660 if (!params->prefix[t])
2661 continue;
2662
2663 for (size_t i = 0; i < context->directories[t].n_items; i++)
2664 n += !context->directories[t].items[i].only_create;
2665 }
2666
2667 if (n <= 0) {
2668 *ret_bind_mounts = NULL;
2669 *ret_n_bind_mounts = 0;
2670 *ret_empty_directories = NULL;
2671 return 0;
2672 }
2673
2674 bind_mounts = new(BindMount, n);
2675 if (!bind_mounts)
2676 return -ENOMEM;
2677
2678 for (size_t i = 0; i < context->n_bind_mounts; i++) {
2679 BindMount *item = context->bind_mounts + i;
2680 _cleanup_free_ char *s = NULL, *d = NULL;
2681
2682 s = strdup(item->source);
2683 if (!s)
2684 return -ENOMEM;
2685
2686 d = strdup(item->destination);
2687 if (!d)
2688 return -ENOMEM;
2689
2690 bind_mounts[h++] = (BindMount) {
2691 .source = TAKE_PTR(s),
2692 .destination = TAKE_PTR(d),
2693 .read_only = item->read_only,
2694 .recursive = item->recursive,
2695 .ignore_enoent = item->ignore_enoent,
2696 };
2697 }
2698
2699 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2700 if (!params->prefix[t])
2701 continue;
2702
2703 if (context->directories[t].n_items == 0)
2704 continue;
2705
2706 if (exec_directory_is_private(context, t) &&
2707 !exec_context_with_rootfs(context)) {
2708 char *private_root;
2709
2710 /* So this is for a dynamic user, and we need to make sure the process can access its own
2711 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2712 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2713
2714 private_root = path_join(params->prefix[t], "private");
2715 if (!private_root)
2716 return -ENOMEM;
2717
2718 r = strv_consume(&empty_directories, private_root);
2719 if (r < 0)
2720 return r;
2721 }
2722
2723 for (size_t i = 0; i < context->directories[t].n_items; i++) {
2724 _cleanup_free_ char *s = NULL, *d = NULL;
2725
2726 /* When one of the parent directories is in the list, we cannot create the symlink
2727 * for the child directory. See also the comments in setup_exec_directory(). */
2728 if (context->directories[t].items[i].only_create)
2729 continue;
2730
2731 if (exec_directory_is_private(context, t))
2732 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
2733 else
2734 s = path_join(params->prefix[t], context->directories[t].items[i].path);
2735 if (!s)
2736 return -ENOMEM;
2737
2738 if (exec_directory_is_private(context, t) &&
2739 exec_context_with_rootfs(context))
2740 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2741 * directory is not created on the root directory. So, let's bind-mount the directory
2742 * on the 'non-private' place. */
2743 d = path_join(params->prefix[t], context->directories[t].items[i].path);
2744 else
2745 d = strdup(s);
2746 if (!d)
2747 return -ENOMEM;
2748
2749 bind_mounts[h++] = (BindMount) {
2750 .source = TAKE_PTR(s),
2751 .destination = TAKE_PTR(d),
2752 .read_only = false,
2753 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2754 .recursive = true,
2755 .ignore_enoent = false,
2756 };
2757 }
2758 }
2759
2760 assert(h == n);
2761
2762 *ret_bind_mounts = TAKE_PTR(bind_mounts);
2763 *ret_n_bind_mounts = n;
2764 *ret_empty_directories = TAKE_PTR(empty_directories);
2765
2766 return (int) n;
2767 }
2768
2769 /* ret_symlinks will contain a list of pairs src:dest that describes
2770 * the symlinks to create later on. For example, the symlinks needed
2771 * to safely give private directories to DynamicUser=1 users. */
2772 static int compile_symlinks(
2773 const ExecContext *context,
2774 const ExecParameters *params,
2775 bool setup_os_release_symlink,
2776 char ***ret_symlinks) {
2777
2778 _cleanup_strv_free_ char **symlinks = NULL;
2779 int r;
2780
2781 assert(context);
2782 assert(params);
2783 assert(ret_symlinks);
2784
2785 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2786 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
2787 _cleanup_free_ char *private_path = NULL, *path = NULL;
2788
2789 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
2790 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
2791
2792 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2793 dst_abs = path_join(params->prefix[dt], *symlink);
2794 if (!src_abs || !dst_abs)
2795 return -ENOMEM;
2796
2797 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2798 if (r < 0)
2799 return r;
2800 }
2801
2802 if (!exec_directory_is_private(context, dt) ||
2803 exec_context_with_rootfs(context) ||
2804 context->directories[dt].items[i].only_create)
2805 continue;
2806
2807 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
2808 if (!private_path)
2809 return -ENOMEM;
2810
2811 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2812 if (!path)
2813 return -ENOMEM;
2814
2815 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2816 if (r < 0)
2817 return r;
2818 }
2819 }
2820
2821 /* We make the host's os-release available via a symlink, so that we can copy it atomically
2822 * and readers will never get a half-written version. Note that, while the paths specified here are
2823 * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2824 * 'os-release -> .os-release-stage/os-release' is what will be created. */
2825 if (setup_os_release_symlink) {
2826 r = strv_extend_many(
2827 &symlinks,
2828 "/run/host/.os-release-stage/os-release",
2829 "/run/host/os-release");
2830 if (r < 0)
2831 return r;
2832 }
2833
2834 *ret_symlinks = TAKE_PTR(symlinks);
2835
2836 return 0;
2837 }
2838
2839 static bool insist_on_sandboxing(
2840 const ExecContext *context,
2841 const char *root_dir,
2842 const char *root_image,
2843 const BindMount *bind_mounts,
2844 size_t n_bind_mounts) {
2845
2846 assert(context);
2847 assert(n_bind_mounts == 0 || bind_mounts);
2848
2849 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2850 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2851 * rearrange stuff in a way we cannot ignore gracefully. */
2852
2853 if (context->n_temporary_filesystems > 0)
2854 return true;
2855
2856 if (root_dir || root_image)
2857 return true;
2858
2859 if (context->n_mount_images > 0)
2860 return true;
2861
2862 if (context->dynamic_user)
2863 return true;
2864
2865 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
2866 return true;
2867
2868 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2869 * essential. */
2870 for (size_t i = 0; i < n_bind_mounts; i++)
2871 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2872 return true;
2873
2874 if (context->log_namespace)
2875 return true;
2876
2877 return false;
2878 }
2879
2880 static int setup_ephemeral(
2881 const ExecContext *context,
2882 ExecRuntime *runtime,
2883 char **root_image, /* both input and output! modified if ephemeral logic enabled */
2884 char **root_directory) { /* ditto */
2885
2886 _cleanup_close_ int fd = -EBADF;
2887 _cleanup_free_ char *new_root = NULL;
2888 int r;
2889
2890 assert(context);
2891 assert(root_image);
2892 assert(root_directory);
2893
2894 if (!*root_image && !*root_directory)
2895 return 0;
2896
2897 if (!runtime || !runtime->ephemeral_copy)
2898 return 0;
2899
2900 assert(runtime->ephemeral_storage_socket[0] >= 0);
2901 assert(runtime->ephemeral_storage_socket[1] >= 0);
2902
2903 new_root = strdup(runtime->ephemeral_copy);
2904 if (!new_root)
2905 return log_oom_debug();
2906
2907 r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
2908 if (r < 0)
2909 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
2910
2911 CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
2912
2913 fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
2914 if (fd >= 0)
2915 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
2916 return 0;
2917 if (fd != -EAGAIN)
2918 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
2919
2920 if (*root_image) {
2921 log_debug("Making ephemeral copy of %s to %s", *root_image, new_root);
2922
2923 fd = copy_file(*root_image,
2924 new_root,
2925 O_EXCL,
2926 0600,
2927 COPY_LOCK_BSD|
2928 COPY_REFLINK|
2929 COPY_CRTIME);
2930 if (fd < 0)
2931 return log_debug_errno(fd, "Failed to copy image %s to %s: %m",
2932 *root_image, new_root);
2933
2934 /* A root image might be subject to lots of random writes so let's try to disable COW on it
2935 * which tends to not perform well in combination with lots of random writes.
2936 *
2937 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
2938 * copy, but we at least want to make the intention clear.
2939 */
2940 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
2941 if (r < 0)
2942 log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", new_root);
2943 } else {
2944 assert(*root_directory);
2945
2946 log_debug("Making ephemeral snapshot of %s to %s", *root_directory, new_root);
2947
2948 fd = btrfs_subvol_snapshot_at(
2949 AT_FDCWD, *root_directory,
2950 AT_FDCWD, new_root,
2951 BTRFS_SNAPSHOT_FALLBACK_COPY |
2952 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
2953 BTRFS_SNAPSHOT_RECURSIVE |
2954 BTRFS_SNAPSHOT_LOCK_BSD);
2955 if (fd < 0)
2956 return log_debug_errno(fd, "Failed to snapshot directory %s to %s: %m",
2957 *root_directory, new_root);
2958 }
2959
2960 r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
2961 if (r < 0)
2962 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
2963
2964 if (*root_image)
2965 free_and_replace(*root_image, new_root);
2966 else {
2967 assert(*root_directory);
2968 free_and_replace(*root_directory, new_root);
2969 }
2970
2971 return 1;
2972 }
2973
2974 static int verity_settings_prepare(
2975 VeritySettings *verity,
2976 const char *root_image,
2977 const void *root_hash,
2978 size_t root_hash_size,
2979 const char *root_hash_path,
2980 const void *root_hash_sig,
2981 size_t root_hash_sig_size,
2982 const char *root_hash_sig_path,
2983 const char *verity_data_path) {
2984
2985 int r;
2986
2987 assert(verity);
2988
2989 if (root_hash) {
2990 void *d;
2991
2992 d = memdup(root_hash, root_hash_size);
2993 if (!d)
2994 return -ENOMEM;
2995
2996 free_and_replace(verity->root_hash, d);
2997 verity->root_hash_size = root_hash_size;
2998 verity->designator = PARTITION_ROOT;
2999 }
3000
3001 if (root_hash_sig) {
3002 void *d;
3003
3004 d = memdup(root_hash_sig, root_hash_sig_size);
3005 if (!d)
3006 return -ENOMEM;
3007
3008 free_and_replace(verity->root_hash_sig, d);
3009 verity->root_hash_sig_size = root_hash_sig_size;
3010 verity->designator = PARTITION_ROOT;
3011 }
3012
3013 if (verity_data_path) {
3014 r = free_and_strdup(&verity->data_path, verity_data_path);
3015 if (r < 0)
3016 return r;
3017 }
3018
3019 r = verity_settings_load(
3020 verity,
3021 root_image,
3022 root_hash_path,
3023 root_hash_sig_path);
3024 if (r < 0)
3025 return log_debug_errno(r, "Failed to load root hash: %m");
3026
3027 return 0;
3028 }
3029
3030 static int pick_versions(
3031 const ExecContext *context,
3032 const ExecParameters *params,
3033 char **ret_root_image,
3034 char **ret_root_directory) {
3035
3036 int r;
3037
3038 assert(context);
3039 assert(params);
3040 assert(ret_root_image);
3041 assert(ret_root_directory);
3042
3043 if (context->root_image) {
3044 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
3045
3046 r = path_pick(/* toplevel_path= */ NULL,
3047 /* toplevel_fd= */ AT_FDCWD,
3048 context->root_image,
3049 &pick_filter_image_raw,
3050 PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3051 &result);
3052 if (r < 0)
3053 return r;
3054
3055 if (!result.path)
3056 return log_exec_debug_errno(context, params, SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_image);
3057
3058 *ret_root_image = TAKE_PTR(result.path);
3059 *ret_root_directory = NULL;
3060 return r;
3061 }
3062
3063 if (context->root_directory) {
3064 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
3065
3066 r = path_pick(/* toplevel_path= */ NULL,
3067 /* toplevel_fd= */ AT_FDCWD,
3068 context->root_directory,
3069 &pick_filter_image_dir,
3070 PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3071 &result);
3072 if (r < 0)
3073 return r;
3074
3075 if (!result.path)
3076 return log_exec_debug_errno(context, params, SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_directory);
3077
3078 *ret_root_image = NULL;
3079 *ret_root_directory = TAKE_PTR(result.path);
3080 return r;
3081 }
3082
3083 *ret_root_image = *ret_root_directory = NULL;
3084 return 0;
3085 }
3086
3087 static int apply_mount_namespace(
3088 ExecCommandFlags command_flags,
3089 const ExecContext *context,
3090 const ExecParameters *params,
3091 ExecRuntime *runtime,
3092 const char *memory_pressure_path,
3093 bool needs_sandboxing,
3094 char **error_path) {
3095
3096 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3097 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3098 **read_write_paths_cleanup = NULL;
3099 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3100 *extension_dir = NULL, *host_os_release_stage = NULL, *root_image = NULL, *root_dir = NULL;
3101 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3102 char **read_write_paths;
3103 bool setup_os_release_symlink;
3104 BindMount *bind_mounts = NULL;
3105 size_t n_bind_mounts = 0;
3106 int r;
3107
3108 assert(context);
3109
3110 CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3111
3112 if (params->flags & EXEC_APPLY_CHROOT) {
3113 r = pick_versions(
3114 context,
3115 params,
3116 &root_image,
3117 &root_dir);
3118 if (r < 0)
3119 return r;
3120
3121 r = setup_ephemeral(
3122 context,
3123 runtime,
3124 &root_image,
3125 &root_dir);
3126 if (r < 0)
3127 return r;
3128 }
3129
3130 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3131 if (r < 0)
3132 return r;
3133
3134 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3135 * service will need to write to it in order to start the notifications. */
3136 if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3137 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3138 if (!read_write_paths_cleanup)
3139 return -ENOMEM;
3140
3141 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3142 if (r < 0)
3143 return r;
3144
3145 read_write_paths = read_write_paths_cleanup;
3146 } else
3147 read_write_paths = context->read_write_paths;
3148
3149 if (needs_sandboxing) {
3150 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3151 * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3152 * use here. This does not apply when we are using /run/systemd/empty as fallback. */
3153
3154 if (context->private_tmp && runtime && runtime->shared) {
3155 if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3156 tmp_dir = runtime->shared->tmp_dir;
3157 else if (runtime->shared->tmp_dir)
3158 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
3159
3160 if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3161 var_tmp_dir = runtime->shared->var_tmp_dir;
3162 else if (runtime->shared->var_tmp_dir)
3163 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
3164 }
3165 }
3166
3167 /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3168 setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
3169 r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3170 if (r < 0)
3171 return r;
3172
3173 if (context->mount_propagation_flag == MS_SHARED)
3174 log_exec_debug(context,
3175 params,
3176 "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3177
3178 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3179 r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path);
3180 if (r < 0)
3181 return r;
3182 }
3183
3184 if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
3185 propagate_dir = path_join("/run/systemd/propagate/", params->unit_id);
3186 if (!propagate_dir)
3187 return -ENOMEM;
3188
3189 incoming_dir = strdup("/run/systemd/incoming");
3190 if (!incoming_dir)
3191 return -ENOMEM;
3192
3193 extension_dir = strdup("/run/systemd/unit-extensions");
3194 if (!extension_dir)
3195 return -ENOMEM;
3196
3197 /* If running under a different root filesystem, propagate the host's os-release. We make a
3198 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3199 if (setup_os_release_symlink) {
3200 host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3201 if (!host_os_release_stage)
3202 return -ENOMEM;
3203 }
3204 } else {
3205 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3206
3207 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3208 return -ENOMEM;
3209
3210 if (setup_os_release_symlink) {
3211 if (asprintf(&host_os_release_stage,
3212 "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3213 geteuid()) < 0)
3214 return -ENOMEM;
3215 }
3216 }
3217
3218 if (root_image) {
3219 r = verity_settings_prepare(
3220 &verity,
3221 root_image,
3222 context->root_hash, context->root_hash_size, context->root_hash_path,
3223 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3224 context->root_verity);
3225 if (r < 0)
3226 return r;
3227 }
3228
3229 NamespaceParameters parameters = {
3230 .runtime_scope = params->runtime_scope,
3231
3232 .root_directory = root_dir,
3233 .root_image = root_image,
3234 .root_image_options = context->root_image_options,
3235 .root_image_policy = context->root_image_policy ?: &image_policy_service,
3236
3237 .read_write_paths = read_write_paths,
3238 .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
3239 .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
3240
3241 .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
3242 .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
3243
3244 .empty_directories = empty_directories,
3245 .symlinks = symlinks,
3246
3247 .bind_mounts = bind_mounts,
3248 .n_bind_mounts = n_bind_mounts,
3249
3250 .temporary_filesystems = context->temporary_filesystems,
3251 .n_temporary_filesystems = context->n_temporary_filesystems,
3252
3253 .mount_images = context->mount_images,
3254 .n_mount_images = context->n_mount_images,
3255 .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
3256
3257 .tmp_dir = tmp_dir,
3258 .var_tmp_dir = var_tmp_dir,
3259
3260 .creds_path = creds_path,
3261 .log_namespace = context->log_namespace,
3262 .mount_propagation_flag = context->mount_propagation_flag,
3263
3264 .verity = &verity,
3265
3266 .extension_images = context->extension_images,
3267 .n_extension_images = context->n_extension_images,
3268 .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
3269 .extension_directories = context->extension_directories,
3270
3271 .propagate_dir = propagate_dir,
3272 .incoming_dir = incoming_dir,
3273 .extension_dir = extension_dir,
3274 .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
3275 .host_os_release_stage = host_os_release_stage,
3276
3277 /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
3278 * otherwise enforce it, don't ignore protected paths and fail if we are enable to apply the
3279 * sandbox inside the mount namespace. */
3280 .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
3281
3282 .protect_control_groups = needs_sandboxing && context->protect_control_groups,
3283 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
3284 .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
3285 .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
3286 .protect_hostname = needs_sandboxing && context->protect_hostname,
3287
3288 .private_dev = needs_sandboxing && context->private_devices,
3289 .private_network = needs_sandboxing && exec_needs_network_namespace(context),
3290 .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
3291
3292 .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
3293
3294 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3295 .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
3296
3297 .protect_home = needs_sandboxing ? context->protect_home : false,
3298 .protect_system = needs_sandboxing ? context->protect_system : false,
3299 .protect_proc = needs_sandboxing ? context->protect_proc : false,
3300 .proc_subset = needs_sandboxing ? context->proc_subset : false,
3301 };
3302
3303 r = setup_namespace(&parameters, error_path);
3304 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3305 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3306 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3307 * completely different execution environment. */
3308 if (r == -ENOANO) {
3309 if (insist_on_sandboxing(
3310 context,
3311 root_dir, root_image,
3312 bind_mounts,
3313 n_bind_mounts))
3314 return log_exec_debug_errno(context,
3315 params,
3316 SYNTHETIC_ERRNO(EOPNOTSUPP),
3317 "Failed to set up namespace, and refusing to continue since "
3318 "the selected namespacing options alter mount environment non-trivially.\n"
3319 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3320 n_bind_mounts,
3321 context->n_temporary_filesystems,
3322 yes_no(root_dir),
3323 yes_no(root_image),
3324 yes_no(context->dynamic_user));
3325
3326 log_exec_debug(context, params, "Failed to set up namespace, assuming containerized execution and ignoring.");
3327 return 0;
3328 }
3329
3330 return r;
3331 }
3332
3333 static int apply_working_directory(
3334 const ExecContext *context,
3335 const ExecParameters *params,
3336 ExecRuntime *runtime,
3337 const char *home,
3338 int *exit_status) {
3339
3340 const char *wd;
3341 int r;
3342
3343 assert(context);
3344 assert(exit_status);
3345
3346 if (context->working_directory_home) {
3347 if (!home) {
3348 *exit_status = EXIT_CHDIR;
3349 return -ENXIO;
3350 }
3351
3352 wd = home;
3353 } else
3354 wd = empty_to_root(context->working_directory);
3355
3356 if (params->flags & EXEC_APPLY_CHROOT)
3357 r = RET_NERRNO(chdir(wd));
3358 else {
3359 _cleanup_close_ int dfd = -EBADF;
3360
3361 r = chase(wd,
3362 (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory,
3363 CHASE_PREFIX_ROOT|CHASE_AT_RESOLVE_IN_ROOT,
3364 /* ret_path= */ NULL,
3365 &dfd);
3366 if (r >= 0)
3367 r = RET_NERRNO(fchdir(dfd));
3368 }
3369
3370 if (r < 0 && !context->working_directory_missing_ok) {
3371 *exit_status = EXIT_CHDIR;
3372 return r;
3373 }
3374
3375 return 0;
3376 }
3377
3378 static int apply_root_directory(
3379 const ExecContext *context,
3380 const ExecParameters *params,
3381 ExecRuntime *runtime,
3382 const bool needs_mount_ns,
3383 int *exit_status) {
3384
3385 assert(context);
3386 assert(exit_status);
3387
3388 if (params->flags & EXEC_APPLY_CHROOT)
3389 if (!needs_mount_ns && context->root_directory)
3390 if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
3391 *exit_status = EXIT_CHROOT;
3392 return -errno;
3393 }
3394
3395 return 0;
3396 }
3397
/* Sets up the kernel session keyring for the service according to context->keyring_mode.
 *
 * Does nothing for EXEC_KEYRING_INHERIT. Otherwise the real UID/GID are temporarily switched to uid/gid
 * (where valid and different from the current ones), a new session keyring is joined, for
 * EXEC_KEYRING_SHARED the user keyring is linked into it, the original credentials are restored, and
 * finally the invocation ID (if set) is stored in the keyring as a "user" key named "invocation_id".
 *
 * Returns 0 on success, and also when joining the keyring failed with ENOSYS, a privilege error or EDQUOT
 * (these are logged at debug level and ignored). Otherwise returns the value produced by
 * log_exec_error_errno() for the first failure. */
static int setup_keyring(
                const ExecContext *context,
                const ExecParameters *p,
                uid_t uid, gid_t gid) {

        key_serial_t keyring;
        int r = 0;
        uid_t saved_uid;
        gid_t saved_gid;

        assert(context);
        assert(p);

        /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
         * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
         * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
         * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
         * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
         * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */

        if (context->keyring_mode == EXEC_KEYRING_INHERIT)
                return 0;

        /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
         * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
         * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
         * & group is just as nasty as acquiring a reference to the user keyring. */

        saved_uid = getuid();
        saved_gid = getgid();

        /* Switch the GID first. Nothing has been changed yet if this fails, hence we can return directly. */
        if (gid_is_valid(gid) && gid != saved_gid) {
                if (setregid(gid, -1) < 0)
                        return log_exec_error_errno(context,
                                                    p,
                                                    errno,
                                                    "Failed to change GID for user keyring: %m");
        }

        /* From here on the GID may already be switched, hence failures must go through the "out" label so
         * that the original credentials are restored. */
        if (uid_is_valid(uid) && uid != saved_uid) {
                if (setreuid(uid, -1) < 0) {
                        r = log_exec_error_errno(context,
                                                 p,
                                                 errno,
                                                 "Failed to change UID for user keyring: %m");
                        goto out;
                }
        }

        keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
        if (keyring == -1) {
                /* The first three cases are only logged; r stays 0, i.e. the failure is ignored. */
                if (errno == ENOSYS)
                        log_exec_debug_errno(context,
                                             p,
                                             errno,
                                             "Kernel keyring not supported, ignoring.");
                else if (ERRNO_IS_PRIVILEGE(errno))
                        log_exec_debug_errno(context,
                                             p,
                                             errno,
                                             "Kernel keyring access prohibited, ignoring.");
                else if (errno == EDQUOT)
                        log_exec_debug_errno(context,
                                             p,
                                             errno,
                                             "Out of kernel keyrings to allocate, ignoring.");
                else
                        r = log_exec_error_errno(context,
                                                 p,
                                                 errno,
                                                 "Setting up kernel keyring failed: %m");

                goto out;
        }

        /* When requested link the user keyring into the session keyring. */
        if (context->keyring_mode == EXEC_KEYRING_SHARED) {

                if (keyctl(KEYCTL_LINK,
                           KEY_SPEC_USER_KEYRING,
                           KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
                        r = log_exec_error_errno(context,
                                                 p,
                                                 errno,
                                                 "Failed to link user keyring into session keyring: %m");
                        goto out;
                }
        }

        /* Restore uid/gid back */
        if (uid_is_valid(uid) && uid != saved_uid) {
                if (setreuid(saved_uid, -1) < 0) {
                        r = log_exec_error_errno(context,
                                                 p,
                                                 errno,
                                                 "Failed to change UID back for user keyring: %m");
                        goto out;
                }
        }

        if (gid_is_valid(gid) && gid != saved_gid) {
                if (setregid(saved_gid, -1) < 0)
                        return log_exec_error_errno(context,
                                                    p,
                                                    errno,
                                                    "Failed to change GID back for user keyring: %m");
        }

        /* Populate the keyring with the invocation ID by default, as original saved_uid. */
        if (!sd_id128_is_null(p->invocation_id)) {
                key_serial_t key;

                key = add_key("user",
                              "invocation_id",
                              &p->invocation_id,
                              sizeof(p->invocation_id),
                              KEY_SPEC_SESSION_KEYRING);
                if (key == -1)
                        log_exec_debug_errno(context,
                                             p,
                                             errno,
                                             "Failed to add invocation ID to keyring, ignoring: %m");
                else {
                        /* Limit the key to view/read/search for possessor and owner. */
                        if (keyctl(KEYCTL_SETPERM, key,
                                   KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
                                   KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
                                r = log_exec_error_errno(context,
                                                         p,
                                                         errno,
                                                         "Failed to restrict invocation ID permission: %m");
                }
        }

out:
        /* Revert back uid & gid for the last time, and exit */
        /* no extra logging, as only the first already reported error matters */
        if (getuid() != saved_uid)
                (void) setreuid(saved_uid, -1);

        if (getgid() != saved_gid)
                (void) setregid(saved_gid, -1);

        return r;
}
3542
/* Appends each valid (i.e. non-negative) fd of the two-element 'pair' to 'array', advancing the element
 * counter *n for every fd stored. The caller must make sure 'array' has room for up to two more
 * entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
3553
3554 static int close_remaining_fds(
3555 const ExecParameters *params,
3556 const ExecRuntime *runtime,
3557 int socket_fd,
3558 const int *fds, size_t n_fds) {
3559
3560 size_t n_dont_close = 0;
3561 int dont_close[n_fds + 14];
3562
3563 assert(params);
3564
3565 if (params->stdin_fd >= 0)
3566 dont_close[n_dont_close++] = params->stdin_fd;
3567 if (params->stdout_fd >= 0)
3568 dont_close[n_dont_close++] = params->stdout_fd;
3569 if (params->stderr_fd >= 0)
3570 dont_close[n_dont_close++] = params->stderr_fd;
3571
3572 if (socket_fd >= 0)
3573 dont_close[n_dont_close++] = socket_fd;
3574 if (n_fds > 0) {
3575 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3576 n_dont_close += n_fds;
3577 }
3578
3579 if (runtime)
3580 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3581
3582 if (runtime && runtime->shared) {
3583 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3584 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
3585 }
3586
3587 if (runtime && runtime->dynamic_creds) {
3588 if (runtime->dynamic_creds->user)
3589 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3590 if (runtime->dynamic_creds->group)
3591 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
3592 }
3593
3594 if (params->user_lookup_fd >= 0)
3595 dont_close[n_dont_close++] = params->user_lookup_fd;
3596
3597 return close_all_fds(dont_close, n_dont_close);
3598 }
3599
/* Reports the resolved UID/GID to the receiver of 'user_lookup_fd' (PID 1). A single writev() is issued
 * carrying three pieces: the uid, the gid and the unit name.
 *
 * Nothing is sent (and 0 is returned) if no fd was passed or if neither uid nor gid is valid.
 * Returns 0 on success, -errno if writev() fails. */
static int send_user_lookup(
                const char *unit_id,
                int user_lookup_fd,
                uid_t uid,
                gid_t gid) {

        assert(unit_id);

        /* Nobody to report to? */
        if (user_lookup_fd < 0)
                return 0;

        /* No user/group was resolved, hence there's nothing to report. */
        if (!uid_is_valid(uid) && !gid_is_valid(gid))
                return 0;

        struct iovec payload[] = {
                IOVEC_MAKE(&uid, sizeof(uid)),
                IOVEC_MAKE(&gid, sizeof(gid)),
                IOVEC_MAKE_STRING(unit_id),
        };

        if (writev(user_lookup_fd, payload, ELEMENTSOF(payload)) < 0)
                return -errno;

        return 0;
}
3627
3628 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3629 int r;
3630
3631 assert(c);
3632 assert(home);
3633 assert(buf);
3634
3635 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3636
3637 if (*home)
3638 return 0;
3639
3640 if (!c->working_directory_home)
3641 return 0;
3642
3643 r = get_home_dir(buf);
3644 if (r < 0)
3645 return r;
3646
3647 *home = *buf;
3648 return 1;
3649 }
3650
3651 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3652 _cleanup_strv_free_ char ** list = NULL;
3653 int r;
3654
3655 assert(c);
3656 assert(p);
3657 assert(ret);
3658
3659 assert(c->dynamic_user);
3660
3661 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3662 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3663 * directories. */
3664
3665 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3666 if (t == EXEC_DIRECTORY_CONFIGURATION)
3667 continue;
3668
3669 if (!p->prefix[t])
3670 continue;
3671
3672 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3673 char *e;
3674
3675 if (exec_directory_is_private(c, t))
3676 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3677 else
3678 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3679 if (!e)
3680 return -ENOMEM;
3681
3682 r = strv_consume(&list, e);
3683 if (r < 0)
3684 return r;
3685 }
3686 }
3687
3688 *ret = TAKE_PTR(list);
3689
3690 return 0;
3691 }
3692
3693 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3694 _cleanup_(cpu_set_reset) CPUSet s = {};
3695 int r;
3696
3697 assert(c);
3698 assert(ret);
3699
3700 if (!c->numa_policy.nodes.set) {
3701 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3702 return 0;
3703 }
3704
3705 r = numa_to_cpu_set(&c->numa_policy, &s);
3706 if (r < 0)
3707 return r;
3708
3709 cpu_set_reset(ret);
3710
3711 return cpu_set_add_all(ret, &s);
3712 }
3713
3714 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int *fd) {
3715 int r;
3716
3717 assert(fds);
3718 assert(n_fds);
3719 assert(*n_fds < fds_size);
3720 assert(fd);
3721
3722 if (*fd < 0)
3723 return 0;
3724
3725 if (*fd < 3 + (int) *n_fds) {
3726 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3727 * the fds we pass to the process (or which are closed only during execve). */
3728
3729 r = fcntl(*fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3730 if (r < 0)
3731 return -errno;
3732
3733 close_and_replace(*fd, r);
3734 }
3735
3736 fds[(*n_fds)++] = *fd;
3737 return 1;
3738 }
3739
3740 static int connect_unix_harder(const ExecContext *c, const ExecParameters *p, const OpenFile *of, int ofd) {
3741 union sockaddr_union addr = {
3742 .un.sun_family = AF_UNIX,
3743 };
3744 socklen_t sa_len;
3745 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3746 int r;
3747
3748 assert(c);
3749 assert(p);
3750 assert(of);
3751 assert(ofd >= 0);
3752
3753 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3754 if (r < 0)
3755 return log_exec_error_errno(c, p, r, "Failed to set sockaddr for %s: %m", of->path);
3756
3757 sa_len = r;
3758
3759 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
3760 _cleanup_close_ int fd = -EBADF;
3761
3762 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
3763 if (fd < 0)
3764 return log_exec_error_errno(c,
3765 p,
3766 errno,
3767 "Failed to create socket for %s: %m",
3768 of->path);
3769
3770 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3771 if (r == -EPROTOTYPE)
3772 continue;
3773 if (r < 0)
3774 return log_exec_error_errno(c,
3775 p,
3776 r,
3777 "Failed to connect socket for %s: %m",
3778 of->path);
3779
3780 return TAKE_FD(fd);
3781 }
3782
3783 return log_exec_error_errno(c,
3784 p,
3785 SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".",
3786 of->path);
3787 }
3788
3789 static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const OpenFile *of) {
3790 struct stat st;
3791 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3792
3793 assert(c);
3794 assert(p);
3795 assert(of);
3796
3797 ofd = open(of->path, O_PATH | O_CLOEXEC);
3798 if (ofd < 0)
3799 return log_exec_error_errno(c, p, errno, "Could not open \"%s\": %m", of->path);
3800
3801 if (fstat(ofd, &st) < 0)
3802 return log_exec_error_errno(c, p, errno, "Failed to stat %s: %m", of->path);
3803
3804 if (S_ISSOCK(st.st_mode)) {
3805 fd = connect_unix_harder(c, p, of, ofd);
3806 if (fd < 0)
3807 return fd;
3808
3809 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
3810 return log_exec_error_errno(c, p, errno, "Failed to shutdown send for socket %s: %m",
3811 of->path);
3812
3813 log_exec_debug(c, p, "socket %s opened (fd=%d)", of->path, fd);
3814 } else {
3815 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3816 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3817 flags |= O_APPEND;
3818 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3819 flags |= O_TRUNC;
3820
3821 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3822 if (fd < 0)
3823 return log_exec_error_errno(c, p, fd, "Failed to open file %s: %m", of->path);
3824
3825 log_exec_debug(c, p, "file %s opened (fd=%d)", of->path, fd);
3826 }
3827
3828 return TAKE_FD(fd);
3829 }
3830
3831 static int collect_open_file_fds(const ExecContext *c, ExecParameters *p, size_t *n_fds) {
3832 int r;
3833
3834 assert(c);
3835 assert(p);
3836 assert(n_fds);
3837
3838 LIST_FOREACH(open_files, of, p->open_files) {
3839 _cleanup_close_ int fd = -EBADF;
3840
3841 fd = get_open_file_fd(c, p, of);
3842 if (fd < 0) {
3843 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3844 log_exec_debug_errno(c, p, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
3845 continue;
3846 }
3847
3848 return fd;
3849 }
3850
3851 if (!GREEDY_REALLOC(p->fds, *n_fds + 1))
3852 return -ENOMEM;
3853
3854 r = strv_extend(&p->fd_names, of->fdname);
3855 if (r < 0)
3856 return r;
3857
3858 p->fds[*n_fds] = TAKE_FD(fd);
3859
3860 (*n_fds)++;
3861 }
3862
3863 return 0;
3864 }
3865
3866 static void log_command_line(
3867 const ExecContext *context,
3868 const ExecParameters *params,
3869 const char *msg,
3870 const char *executable,
3871 char **argv) {
3872
3873 assert(context);
3874 assert(params);
3875 assert(msg);
3876 assert(executable);
3877
3878 if (!DEBUG_LOGGING)
3879 return;
3880
3881 _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3882
3883 log_exec_struct(context, params, LOG_DEBUG,
3884 "EXECUTABLE=%s", executable,
3885 LOG_EXEC_MESSAGE(params, "%s: %s", msg, strnull(cmdline)),
3886 LOG_EXEC_INVOCATION_ID(params));
3887 }
3888
3889 static bool exec_context_need_unprivileged_private_users(
3890 const ExecContext *context,
3891 const ExecParameters *params) {
3892
3893 assert(context);
3894 assert(params);
3895
3896 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3897 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3898 * (system manager) then we have privileges and don't need this. */
3899 if (params->runtime_scope != RUNTIME_SCOPE_USER)
3900 return false;
3901
3902 return context->private_users ||
3903 context->private_tmp ||
3904 context->private_devices ||
3905 context->private_network ||
3906 context->network_namespace_path ||
3907 context->private_ipc ||
3908 context->ipc_namespace_path ||
3909 context->private_mounts > 0 ||
3910 context->mount_apivfs ||
3911 context->n_bind_mounts > 0 ||
3912 context->n_temporary_filesystems > 0 ||
3913 context->root_directory ||
3914 !strv_isempty(context->extension_directories) ||
3915 context->protect_system != PROTECT_SYSTEM_NO ||
3916 context->protect_home != PROTECT_HOME_NO ||
3917 context->protect_kernel_tunables ||
3918 context->protect_kernel_modules ||
3919 context->protect_kernel_logs ||
3920 context->protect_control_groups ||
3921 context->protect_clock ||
3922 context->protect_hostname ||
3923 !strv_isempty(context->read_write_paths) ||
3924 !strv_isempty(context->read_only_paths) ||
3925 !strv_isempty(context->inaccessible_paths) ||
3926 !strv_isempty(context->exec_paths) ||
3927 !strv_isempty(context->no_exec_paths);
3928 }
3929
3930 static bool exec_context_shall_confirm_spawn(const ExecContext *context) {
3931 assert(context);
3932
3933 if (confirm_spawn_disabled())
3934 return false;
3935
3936 /* For some reasons units remaining in the same process group
3937 * as PID 1 fail to acquire the console even if it's not used
3938 * by any process. So skip the confirmation question for them. */
3939 return !context->same_pgrp;
3940 }
3941
3942 static int exec_context_named_iofds(
3943 const ExecContext *c,
3944 const ExecParameters *p,
3945 int named_iofds[static 3]) {
3946
3947 size_t targets;
3948 const char* stdio_fdname[3];
3949 size_t n_fds;
3950
3951 assert(c);
3952 assert(p);
3953 assert(named_iofds);
3954
3955 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3956 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3957 (c->std_error == EXEC_OUTPUT_NAMED_FD);
3958
3959 for (size_t i = 0; i < 3; i++)
3960 stdio_fdname[i] = exec_context_fdname(c, i);
3961
3962 n_fds = p->n_storage_fds + p->n_socket_fds;
3963
3964 for (size_t i = 0; i < n_fds && targets > 0; i++)
3965 if (named_iofds[STDIN_FILENO] < 0 &&
3966 c->std_input == EXEC_INPUT_NAMED_FD &&
3967 stdio_fdname[STDIN_FILENO] &&
3968 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3969
3970 named_iofds[STDIN_FILENO] = p->fds[i];
3971 targets--;
3972
3973 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3974 c->std_output == EXEC_OUTPUT_NAMED_FD &&
3975 stdio_fdname[STDOUT_FILENO] &&
3976 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3977
3978 named_iofds[STDOUT_FILENO] = p->fds[i];
3979 targets--;
3980
3981 } else if (named_iofds[STDERR_FILENO] < 0 &&
3982 c->std_error == EXEC_OUTPUT_NAMED_FD &&
3983 stdio_fdname[STDERR_FILENO] &&
3984 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3985
3986 named_iofds[STDERR_FILENO] = p->fds[i];
3987 targets--;
3988 }
3989
3990 return targets == 0 ? 0 : -ENOENT;
3991 }
3992
3993 static void exec_shared_runtime_close(ExecSharedRuntime *shared) {
3994 if (!shared)
3995 return;
3996
3997 safe_close_pair(shared->netns_storage_socket);
3998 safe_close_pair(shared->ipcns_storage_socket);
3999 }
4000
4001 static void exec_runtime_close(ExecRuntime *rt) {
4002 if (!rt)
4003 return;
4004
4005 safe_close_pair(rt->ephemeral_storage_socket);
4006
4007 exec_shared_runtime_close(rt->shared);
4008 dynamic_creds_close(rt->dynamic_creds);
4009 }
4010
4011 static void exec_params_close(ExecParameters *p) {
4012 if (!p)
4013 return;
4014
4015 p->stdin_fd = safe_close(p->stdin_fd);
4016 p->stdout_fd = safe_close(p->stdout_fd);
4017 p->stderr_fd = safe_close(p->stderr_fd);
4018 }
4019
4020 int exec_invoke(
4021 const ExecCommand *command,
4022 const ExecContext *context,
4023 ExecParameters *params,
4024 ExecRuntime *runtime,
4025 const CGroupContext *cgroup_context,
4026 int *exit_status) {
4027
4028 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4029 int r, ngids = 0;
4030 _cleanup_free_ gid_t *supplementary_gids = NULL;
4031 const char *username = NULL, *groupname = NULL;
4032 _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
4033 const char *home = NULL, *shell = NULL;
4034 char **final_argv = NULL;
4035 dev_t journal_stream_dev = 0;
4036 ino_t journal_stream_ino = 0;
4037 bool userns_set_up = false;
4038 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4039 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
4040 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
4041 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
4042 bool keep_seccomp_privileges = false;
4043 #if HAVE_SELINUX
4044 _cleanup_free_ char *mac_selinux_context_net = NULL;
4045 bool use_selinux = false;
4046 #endif
4047 #if ENABLE_SMACK
4048 bool use_smack = false;
4049 #endif
4050 #if HAVE_APPARMOR
4051 bool use_apparmor = false;
4052 #endif
4053 #if HAVE_SECCOMP
4054 uint64_t saved_bset = 0;
4055 #endif
4056 uid_t saved_uid = getuid();
4057 gid_t saved_gid = getgid();
4058 uid_t uid = UID_INVALID;
4059 gid_t gid = GID_INVALID;
4060 size_t n_fds, /* fds to pass to the child */
4061 n_keep_fds; /* total number of fds not to close */
4062 int secure_bits;
4063 _cleanup_free_ gid_t *gids_after_pam = NULL;
4064 int ngids_after_pam = 0;
4065
4066 int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
4067 size_t n_storage_fds, n_socket_fds;
4068
4069 assert(command);
4070 assert(context);
4071 assert(params);
4072 assert(exit_status);
4073
4074 /* This should be mostly redundant, as the log level is also passed as an argument of the executor,
4075 * and is already applied earlier. Just for safety. */
4076 if (context->log_level_max >= 0)
4077 log_set_max_level(context->log_level_max);
4078
4079 /* Explicitly test for CVE-2021-4034 inspired invocations */
4080 if (!command->path || strv_isempty(command->argv)) {
4081 *exit_status = EXIT_EXEC;
4082 return log_exec_error_errno(
4083 context,
4084 params,
4085 SYNTHETIC_ERRNO(EINVAL),
4086 "Invalid command line arguments.");
4087 }
4088
4089 LOG_CONTEXT_PUSH_EXEC(context, params);
4090
4091 if (context->std_input == EXEC_INPUT_SOCKET ||
4092 context->std_output == EXEC_OUTPUT_SOCKET ||
4093 context->std_error == EXEC_OUTPUT_SOCKET) {
4094
4095 if (params->n_socket_fds > 1)
4096 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
4097
4098 if (params->n_socket_fds == 0)
4099 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
4100
4101 socket_fd = params->fds[0];
4102 n_storage_fds = n_socket_fds = 0;
4103 } else {
4104 n_socket_fds = params->n_socket_fds;
4105 n_storage_fds = params->n_storage_fds;
4106 }
4107 n_fds = n_socket_fds + n_storage_fds;
4108
4109 r = exec_context_named_iofds(context, params, named_iofds);
4110 if (r < 0)
4111 return log_exec_error_errno(context, params, r, "Failed to load a named file descriptor: %m");
4112
4113 rename_process_from_path(command->path);
4114
4115 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4116 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4117 * both of which will be demoted to SIG_DFL. */
4118 (void) default_signals(SIGNALS_CRASH_HANDLER,
4119 SIGNALS_IGNORE);
4120
4121 if (context->ignore_sigpipe)
4122 (void) ignore_signals(SIGPIPE);
4123
4124 r = reset_signal_mask();
4125 if (r < 0) {
4126 *exit_status = EXIT_SIGNAL_MASK;
4127 return log_exec_error_errno(context, params, r, "Failed to set process signal mask: %m");
4128 }
4129
4130 if (params->idle_pipe)
4131 do_idle_pipe_dance(params->idle_pipe);
4132
4133 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4134 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4135 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4136 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4137
4138 log_forget_fds();
4139 log_set_open_when_needed(true);
4140 log_settle_target();
4141
4142 /* In case anything used libc syslog(), close this here, too */
4143 closelog();
4144
4145 r = collect_open_file_fds(context, params, &n_fds);
4146 if (r < 0) {
4147 *exit_status = EXIT_FDS;
4148 return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m");
4149 }
4150
4151 int keep_fds[n_fds + 3];
4152 memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int));
4153 n_keep_fds = n_fds;
4154
4155 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->exec_fd);
4156 if (r < 0) {
4157 *exit_status = EXIT_FDS;
4158 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4159 }
4160
4161 #if HAVE_LIBBPF
4162 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->bpf_restrict_fs_map_fd);
4163 if (r < 0) {
4164 *exit_status = EXIT_FDS;
4165 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4166 }
4167 #endif
4168
4169 r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds);
4170 if (r < 0) {
4171 *exit_status = EXIT_FDS;
4172 return log_exec_error_errno(context, params, r, "Failed to close unwanted file descriptors: %m");
4173 }
4174
4175 if (!context->same_pgrp &&
4176 setsid() < 0) {
4177 *exit_status = EXIT_SETSID;
4178 return log_exec_error_errno(context, params, errno, "Failed to create new process session: %m");
4179 }
4180
4181 exec_context_tty_reset(context, params);
4182
4183 if (params->shall_confirm_spawn && exec_context_shall_confirm_spawn(context)) {
4184 _cleanup_free_ char *cmdline = NULL;
4185
4186 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4187 if (!cmdline) {
4188 *exit_status = EXIT_MEMORY;
4189 return log_oom();
4190 }
4191
4192 r = ask_for_confirmation(context, params, cmdline);
4193 if (r != CONFIRM_EXECUTE) {
4194 if (r == CONFIRM_PRETEND_SUCCESS) {
4195 *exit_status = EXIT_SUCCESS;
4196 return 0;
4197 }
4198
4199 *exit_status = EXIT_CONFIRM;
4200 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ECANCELED),
4201 "Execution cancelled by the user");
4202 }
4203 }
4204
4205 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4206 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4207 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4208 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4209 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4210 if (setenv("SYSTEMD_ACTIVATION_UNIT", params->unit_id, true) != 0 ||
4211 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4212 *exit_status = EXIT_MEMORY;
4213 return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4214 }
4215
4216 if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4217 _cleanup_strv_free_ char **suggested_paths = NULL;
4218
4219 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4220 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4221 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4222 *exit_status = EXIT_USER;
4223 return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4224 }
4225
4226 r = compile_suggested_paths(context, params, &suggested_paths);
4227 if (r < 0) {
4228 *exit_status = EXIT_MEMORY;
4229 return log_oom();
4230 }
4231
4232 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4233 if (r < 0) {
4234 *exit_status = EXIT_USER;
4235 if (r == -EILSEQ)
4236 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4237 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4238 return log_exec_error_errno(context, params, r, "Failed to update dynamic user credentials: %m");
4239 }
4240
4241 if (!uid_is_valid(uid)) {
4242 *exit_status = EXIT_USER;
4243 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4244 }
4245
4246 if (!gid_is_valid(gid)) {
4247 *exit_status = EXIT_USER;
4248 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4249 }
4250
4251 if (runtime->dynamic_creds->user)
4252 username = runtime->dynamic_creds->user->name;
4253
4254 } else {
4255 if (context->user) {
4256 r = get_fixed_user(context->user, &username, &uid, &gid, &home, &shell);
4257 if (r < 0) {
4258 *exit_status = EXIT_USER;
4259 return log_exec_error_errno(context, params, r, "Failed to determine user credentials: %m");
4260 }
4261 }
4262
4263 if (context->group) {
4264 r = get_fixed_group(context->group, &groupname, &gid);
4265 if (r < 0) {
4266 *exit_status = EXIT_GROUP;
4267 return log_exec_error_errno(context, params, r, "Failed to determine group credentials: %m");
4268 }
4269 }
4270 }
4271
4272 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4273 r = get_supplementary_groups(context, username, groupname, gid,
4274 &supplementary_gids, &ngids);
4275 if (r < 0) {
4276 *exit_status = EXIT_GROUP;
4277 return log_exec_error_errno(context, params, r, "Failed to determine supplementary groups: %m");
4278 }
4279
4280 r = send_user_lookup(params->unit_id, params->user_lookup_fd, uid, gid);
4281 if (r < 0) {
4282 *exit_status = EXIT_USER;
4283 return log_exec_error_errno(context, params, r, "Failed to send user credentials to PID1: %m");
4284 }
4285
4286 params->user_lookup_fd = safe_close(params->user_lookup_fd);
4287
4288 r = acquire_home(context, uid, &home, &home_buffer);
4289 if (r < 0) {
4290 *exit_status = EXIT_CHDIR;
4291 return log_exec_error_errno(context, params, r, "Failed to determine $HOME for user: %m");
4292 }
4293
4294 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4295 if (socket_fd >= 0)
4296 (void) fd_nonblock(socket_fd, false);
4297
4298 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4299 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4300 if (params->cgroup_path) {
4301 _cleanup_free_ char *p = NULL;
4302
4303 r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4304 if (r < 0) {
4305 *exit_status = EXIT_CGROUP;
4306 return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4307 }
4308
4309 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4310 if (r == -EUCLEAN) {
4311 *exit_status = EXIT_CGROUP;
4312 return log_exec_error_errno(context, params, r, "Failed to attach process to cgroup %s "
4313 "because the cgroup or one of its parents or "
4314 "siblings is in the threaded mode: %m", p);
4315 }
4316 if (r < 0) {
4317 *exit_status = EXIT_CGROUP;
4318 return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", p);
4319 }
4320 }
4321
4322 if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4323 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4324 if (r < 0) {
4325 *exit_status = EXIT_NETWORK;
4326 return log_exec_error_errno(context, params, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4327 }
4328 }
4329
4330 if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4331 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4332 if (r < 0) {
4333 *exit_status = EXIT_NAMESPACE;
4334 return log_exec_error_errno(context, params, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4335 }
4336 }
4337
4338 r = setup_input(context, params, socket_fd, named_iofds);
4339 if (r < 0) {
4340 *exit_status = EXIT_STDIN;
4341 return log_exec_error_errno(context, params, r, "Failed to set up standard input: %m");
4342 }
4343
4344 r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4345 if (r < 0) {
4346 *exit_status = EXIT_STDOUT;
4347 return log_exec_error_errno(context, params, r, "Failed to set up standard output: %m");
4348 }
4349
4350 r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4351 if (r < 0) {
4352 *exit_status = EXIT_STDERR;
4353 return log_exec_error_errno(context, params, r, "Failed to set up standard error output: %m");
4354 }
4355
4356 if (context->oom_score_adjust_set) {
4357 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4358 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4359 r = set_oom_score_adjust(context->oom_score_adjust);
4360 if (ERRNO_IS_NEG_PRIVILEGE(r))
4361 log_exec_debug_errno(context, params, r,
4362 "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4363 else if (r < 0) {
4364 *exit_status = EXIT_OOM_ADJUST;
4365 return log_exec_error_errno(context, params, r, "Failed to adjust OOM setting: %m");
4366 }
4367 }
4368
4369 if (context->coredump_filter_set) {
4370 r = set_coredump_filter(context->coredump_filter);
4371 if (ERRNO_IS_NEG_PRIVILEGE(r))
4372 log_exec_debug_errno(context, params, r, "Failed to adjust coredump_filter, ignoring: %m");
4373 else if (r < 0) {
4374 *exit_status = EXIT_LIMITS;
4375 return log_exec_error_errno(context, params, r, "Failed to adjust coredump_filter: %m");
4376 }
4377 }
4378
4379 if (context->nice_set) {
4380 r = setpriority_closest(context->nice);
4381 if (r < 0) {
4382 *exit_status = EXIT_NICE;
4383 return log_exec_error_errno(context, params, r, "Failed to set up process scheduling priority (nice level): %m");
4384 }
4385 }
4386
4387 if (context->cpu_sched_set) {
4388 struct sched_param param = {
4389 .sched_priority = context->cpu_sched_priority,
4390 };
4391
4392 r = sched_setscheduler(0,
4393 context->cpu_sched_policy |
4394 (context->cpu_sched_reset_on_fork ?
4395 SCHED_RESET_ON_FORK : 0),
4396 &param);
4397 if (r < 0) {
4398 *exit_status = EXIT_SETSCHEDULER;
4399 return log_exec_error_errno(context, params, errno, "Failed to set up CPU scheduling: %m");
4400 }
4401 }
4402
4403 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4404 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4405 const CPUSet *cpu_set;
4406
4407 if (context->cpu_affinity_from_numa) {
4408 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4409 if (r < 0) {
4410 *exit_status = EXIT_CPUAFFINITY;
4411 return log_exec_error_errno(context, params, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4412 }
4413
4414 cpu_set = &converted_cpu_set;
4415 } else
4416 cpu_set = &context->cpu_set;
4417
4418 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4419 *exit_status = EXIT_CPUAFFINITY;
4420 return log_exec_error_errno(context, params, errno, "Failed to set up CPU affinity: %m");
4421 }
4422 }
4423
4424 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4425 r = apply_numa_policy(&context->numa_policy);
4426 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4427 log_exec_debug_errno(context, params, r, "NUMA support not available, ignoring.");
4428 else if (r < 0) {
4429 *exit_status = EXIT_NUMA_POLICY;
4430 return log_exec_error_errno(context, params, r, "Failed to set NUMA memory policy: %m");
4431 }
4432 }
4433
4434 if (context->ioprio_set)
4435 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4436 *exit_status = EXIT_IOPRIO;
4437 return log_exec_error_errno(context, params, errno, "Failed to set up IO scheduling priority: %m");
4438 }
4439
4440 if (context->timer_slack_nsec != NSEC_INFINITY)
4441 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4442 *exit_status = EXIT_TIMERSLACK;
4443 return log_exec_error_errno(context, params, errno, "Failed to set up timer slack: %m");
4444 }
4445
4446 if (context->personality != PERSONALITY_INVALID) {
4447 r = safe_personality(context->personality);
4448 if (r < 0) {
4449 *exit_status = EXIT_PERSONALITY;
4450 return log_exec_error_errno(context, params, r, "Failed to set up execution domain (personality): %m");
4451 }
4452 }
4453
4454 #if ENABLE_UTMP
4455 if (context->utmp_id) {
4456 const char *line = context->tty_path ?
4457 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4458 NULL;
4459 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4460 line,
4461 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4462 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4463 USER_PROCESS,
4464 username);
4465 }
4466 #endif
4467
4468 if (uid_is_valid(uid)) {
4469 r = chown_terminal(STDIN_FILENO, uid);
4470 if (r < 0) {
4471 *exit_status = EXIT_STDIN;
4472 return log_exec_error_errno(context, params, r, "Failed to change ownership of terminal: %m");
4473 }
4474 }
4475
4476 if (params->cgroup_path) {
4477 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4478 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4479 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4480 * touch a single hierarchy too. */
4481
4482 if (params->flags & EXEC_CGROUP_DELEGATE) {
4483 _cleanup_free_ char *p = NULL;
4484
4485 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4486 if (r < 0) {
4487 *exit_status = EXIT_CGROUP;
4488 return log_exec_error_errno(context, params, r, "Failed to adjust control group access: %m");
4489 }
4490
4491 r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4492 if (r < 0) {
4493 *exit_status = EXIT_CGROUP;
4494 return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4495 }
4496 if (r > 0) {
4497 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
4498 if (r < 0) {
4499 *exit_status = EXIT_CGROUP;
4500 return log_exec_error_errno(context, params, r, "Failed to adjust control subgroup access: %m");
4501 }
4502 }
4503 }
4504
4505 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4506 if (cgroup_context_want_memory_pressure(cgroup_context)) {
4507 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4508 if (r < 0) {
4509 *exit_status = EXIT_MEMORY;
4510 return log_oom();
4511 }
4512
4513 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4514 if (r < 0) {
4515 log_exec_full_errno(context, params, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4516 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4517 memory_pressure_path = mfree(memory_pressure_path);
4518 }
4519 } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4520 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4521 if (!memory_pressure_path) {
4522 *exit_status = EXIT_MEMORY;
4523 return log_oom();
4524 }
4525 }
4526 }
4527 }
4528
4529 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4530
4531 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4532 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4533 if (r < 0)
4534 return log_exec_error_errno(context, params, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4535 }
4536
4537 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4538 r = exec_setup_credentials(context, params, params->unit_id, uid, gid);
4539 if (r < 0) {
4540 *exit_status = EXIT_CREDENTIALS;
4541 return log_exec_error_errno(context, params, r, "Failed to set up credentials: %m");
4542 }
4543 }
4544
4545 r = build_environment(
4546 context,
4547 params,
4548 cgroup_context,
4549 n_fds,
4550 home,
4551 username,
4552 shell,
4553 journal_stream_dev,
4554 journal_stream_ino,
4555 memory_pressure_path,
4556 &our_env);
4557 if (r < 0) {
4558 *exit_status = EXIT_MEMORY;
4559 return log_oom();
4560 }
4561
4562 r = build_pass_environment(context, &pass_env);
4563 if (r < 0) {
4564 *exit_status = EXIT_MEMORY;
4565 return log_oom();
4566 }
4567
4568 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4569 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4570 * not specify PATH but the unit has ExecSearchPath. */
4571 if (!strv_isempty(context->exec_search_path)) {
4572 _cleanup_free_ char *joined = NULL;
4573
4574 joined = strv_join(context->exec_search_path, ":");
4575 if (!joined) {
4576 *exit_status = EXIT_MEMORY;
4577 return log_oom();
4578 }
4579
4580 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4581 if (r < 0) {
4582 *exit_status = EXIT_MEMORY;
4583 return log_oom();
4584 }
4585 }
4586
4587 accum_env = strv_env_merge(params->environment,
4588 our_env,
4589 joined_exec_search_path,
4590 pass_env,
4591 context->environment,
4592 params->files_env);
4593 if (!accum_env) {
4594 *exit_status = EXIT_MEMORY;
4595 return log_oom();
4596 }
4597 accum_env = strv_env_clean(accum_env);
4598
4599 (void) umask(context->umask);
4600
4601 r = setup_keyring(context, params, uid, gid);
4602 if (r < 0) {
4603 *exit_status = EXIT_KEYRING;
4604 return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m");
4605 }
4606
4607 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4608 * from it. */
4609 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4610
4611 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4612 * for it, and the kernel doesn't actually support ambient caps. */
4613 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4614
4615 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4616 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4617 * desired. */
4618 if (needs_ambient_hack)
4619 needs_setuid = false;
4620 else
4621 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4622
4623 uint64_t capability_ambient_set = context->capability_ambient_set;
4624
4625 if (needs_sandboxing) {
4626 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4627 * /sys being present. The actual MAC context application will happen later, as late as
4628 * possible, to avoid impacting our own code paths. */
4629
4630 #if HAVE_SELINUX
4631 use_selinux = mac_selinux_use();
4632 #endif
4633 #if ENABLE_SMACK
4634 use_smack = mac_smack_use();
4635 #endif
4636 #if HAVE_APPARMOR
4637 use_apparmor = mac_apparmor_use();
4638 #endif
4639 }
4640
4641 if (needs_sandboxing) {
4642 int which_failed;
4643
4644 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4645 * is set here. (See below.) */
4646
4647 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4648 if (r < 0) {
4649 *exit_status = EXIT_LIMITS;
4650 return log_exec_error_errno(context, params, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4651 }
4652 }
4653
4654 if (needs_setuid && context->pam_name && username) {
4655 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4656 * wins here. (See above.) */
4657
4658 /* All fds passed in the fds array will be closed in the pam child process. */
4659 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, params->fds, n_fds, params->exec_fd);
4660 if (r < 0) {
4661 *exit_status = EXIT_PAM;
4662 return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
4663 }
4664
4665 if (ambient_capabilities_supported()) {
4666 uint64_t ambient_after_pam;
4667
4668 /* PAM modules might have set some ambient caps. Query them here and merge them into
4669 * the caps we want to set in the end, so that we don't end up unsetting them. */
4670 r = capability_get_ambient(&ambient_after_pam);
4671 if (r < 0) {
4672 *exit_status = EXIT_CAPABILITIES;
4673 return log_exec_error_errno(context, params, r, "Failed to query ambient caps: %m");
4674 }
4675
4676 capability_ambient_set |= ambient_after_pam;
4677 }
4678
4679 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4680 if (ngids_after_pam < 0) {
4681 *exit_status = EXIT_GROUP;
4682 return log_exec_error_errno(context, params, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4683 }
4684 }
4685
4686 if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
4687 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4688 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4689 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4690
4691 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4692 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4693 * the actual requested operations fail (or silently continue). */
4694 if (r < 0 && context->private_users) {
4695 *exit_status = EXIT_USER;
4696 return log_exec_error_errno(context, params, r, "Failed to set up user namespacing for unprivileged user: %m");
4697 }
4698 if (r < 0)
4699 log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4700 else
4701 userns_set_up = true;
4702 }
4703
4704 if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4705
4706 /* Try to enable network namespacing if network namespacing is available and we have
4707 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4708 * new network namespace. And if we don't have that, then we could only create a network
4709 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4710 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
4711 r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
4712 if (ERRNO_IS_NEG_PRIVILEGE(r))
4713 log_exec_notice_errno(context, params, r,
4714 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4715 else if (r < 0) {
4716 *exit_status = EXIT_NETWORK;
4717 return log_exec_error_errno(context, params, r, "Failed to set up network namespacing: %m");
4718 }
4719 } else if (context->network_namespace_path) {
4720 *exit_status = EXIT_NETWORK;
4721 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4722 "NetworkNamespacePath= is not supported, refusing.");
4723 } else
4724 log_exec_notice(context, params, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4725 }
4726
4727 if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4728
4729 if (ns_type_supported(NAMESPACE_IPC)) {
4730 r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4731 if (r == -EPERM)
4732 log_exec_warning_errno(context, params, r,
4733 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4734 else if (r < 0) {
4735 *exit_status = EXIT_NAMESPACE;
4736 return log_exec_error_errno(context, params, r, "Failed to set up IPC namespacing: %m");
4737 }
4738 } else if (context->ipc_namespace_path) {
4739 *exit_status = EXIT_NAMESPACE;
4740 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4741 "IPCNamespacePath= is not supported, refusing.");
4742 } else
4743 log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4744 }
4745
4746 if (needs_mount_namespace) {
4747 _cleanup_free_ char *error_path = NULL;
4748
4749 r = apply_mount_namespace(command->flags,
4750 context,
4751 params,
4752 runtime,
4753 memory_pressure_path,
4754 needs_sandboxing,
4755 &error_path);
4756 if (r < 0) {
4757 *exit_status = EXIT_NAMESPACE;
4758 return log_exec_error_errno(context, params, r, "Failed to set up mount namespacing%s%s: %m",
4759 error_path ? ": " : "", strempty(error_path));
4760 }
4761 }
4762
4763 if (needs_sandboxing) {
4764 r = apply_protect_hostname(context, params, exit_status);
4765 if (r < 0)
4766 return r;
4767 }
4768
4769 if (context->memory_ksm >= 0)
4770 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
4771 if (ERRNO_IS_NOT_SUPPORTED(errno))
4772 log_exec_debug_errno(context,
4773 params,
4774 errno,
4775 "KSM support not available, ignoring.");
4776 else {
4777 *exit_status = EXIT_KSM;
4778 return log_exec_error_errno(context, params, errno, "Failed to set KSM: %m");
4779 }
4780 }
4781
4782 /* Drop groups as early as possible.
4783 * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root.
4784 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4785 if (needs_setuid) {
4786 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4787 int ngids_to_enforce = 0;
4788
4789 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4790 ngids,
4791 gids_after_pam,
4792 ngids_after_pam,
4793 &gids_to_enforce);
4794 if (ngids_to_enforce < 0) {
4795 *exit_status = EXIT_GROUP;
4796 return log_exec_error_errno(context, params,
4797 ngids_to_enforce,
4798 "Failed to merge group lists. Group membership might be incorrect: %m");
4799 }
4800
4801 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4802 if (r < 0) {
4803 *exit_status = EXIT_GROUP;
4804 return log_exec_error_errno(context, params, r, "Changing group credentials failed: %m");
4805 }
4806 }
4807
4808 /* If the user namespace was not set up above, try to do it now.
4809 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4810 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4811 * case of mount namespaces being less privileged when the mount point list is copied from a
4812 * different user namespace). */
4813
4814 if (needs_sandboxing && context->private_users && !userns_set_up) {
4815 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4816 if (r < 0) {
4817 *exit_status = EXIT_USER;
4818 return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m");
4819 }
4820 }
4821
4822 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4823 * shall execute. */
4824
4825 _cleanup_free_ char *executable = NULL;
4826 _cleanup_close_ int executable_fd = -EBADF;
4827 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4828 if (r < 0) {
4829 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4830 log_exec_struct_errno(context, params, LOG_INFO, r,
4831 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4832 LOG_EXEC_INVOCATION_ID(params),
4833 LOG_EXEC_MESSAGE(params,
4834 "Executable %s missing, skipping: %m",
4835 command->path),
4836 "EXECUTABLE=%s", command->path);
4837 *exit_status = EXIT_SUCCESS;
4838 return 0;
4839 }
4840
4841 *exit_status = EXIT_EXEC;
4842 return log_exec_struct_errno(context, params, LOG_INFO, r,
4843 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4844 LOG_EXEC_INVOCATION_ID(params),
4845 LOG_EXEC_MESSAGE(params,
4846 "Failed to locate executable %s: %m",
4847 command->path),
4848 "EXECUTABLE=%s", command->path);
4849 }
4850
4851 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &executable_fd);
4852 if (r < 0) {
4853 *exit_status = EXIT_FDS;
4854 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4855 }
4856
4857 #if HAVE_SELINUX
4858 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4859 int fd = -EBADF;
4860
4861 if (socket_fd >= 0)
4862 fd = socket_fd;
4863 else if (params->n_socket_fds == 1)
4864 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4865 * use context from that fd to compute the label. */
4866 fd = params->fds[0];
4867
4868 if (fd >= 0) {
4869 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4870 if (r < 0) {
4871 if (!context->selinux_context_ignore) {
4872 *exit_status = EXIT_SELINUX_CONTEXT;
4873 return log_exec_error_errno(context,
4874 params,
4875 r,
4876 "Failed to determine SELinux context: %m");
4877 }
4878 log_exec_debug_errno(context,
4879 params,
4880 r,
4881 "Failed to determine SELinux context, ignoring: %m");
4882 }
4883 }
4884 }
4885 #endif
4886
4887 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4888 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4889 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
4890 * execve(). But first, close the remaining sockets in the context objects. */
4891
4892 exec_runtime_close(runtime);
4893 exec_params_close(params);
4894
4895 r = close_all_fds(keep_fds, n_keep_fds);
4896 if (r >= 0)
4897 r = shift_fds(params->fds, n_fds);
4898 if (r >= 0)
4899 r = flag_fds(params->fds, n_socket_fds, n_fds, context->non_blocking);
4900 if (r < 0) {
4901 *exit_status = EXIT_FDS;
4902 return log_exec_error_errno(context, params, r, "Failed to adjust passed file descriptors: %m");
4903 }
4904
4905 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4906 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4907 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4908 * came this far. */
4909
4910 secure_bits = context->secure_bits;
4911
4912 if (needs_sandboxing) {
4913 uint64_t bset;
4914
4915 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4916 * (Note this is placed after the general resource limit initialization, see above, in order
4917 * to take precedence.) */
4918 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4919 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4920 *exit_status = EXIT_LIMITS;
4921 return log_exec_error_errno(context, params, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4922 }
4923 }
4924
4925 #if ENABLE_SMACK
4926 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4927 * process. This is the latest place before dropping capabilities. Other MAC contexts are set later. */
4928 if (use_smack && context->smack_process_label) {
4929 r = setup_smack(params, context, executable_fd);
4930 if (r < 0 && !context->smack_process_label_ignore) {
4931 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4932 return log_exec_error_errno(context, params, r, "Failed to set SMACK process label: %m");
4933 }
4934 }
4935 #endif
4936
4937 bset = context->capability_bounding_set;
4938 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4939 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4940 * instead of us doing that */
4941 if (needs_ambient_hack)
4942 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4943 (UINT64_C(1) << CAP_SETUID) |
4944 (UINT64_C(1) << CAP_SETGID);
4945
4946 #if HAVE_SECCOMP
4947 /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
4948 * keep the needed privileges to apply it even if we're not root. */
4949 if (needs_setuid &&
4950 uid_is_valid(uid) &&
4951 context_has_seccomp(context) &&
4952 seccomp_allows_drop_privileges(context)) {
4953 keep_seccomp_privileges = true;
4954
4955 if (prctl(PR_SET_KEEPCAPS, 1) < 0) {
4956 *exit_status = EXIT_USER;
4957 return log_exec_error_errno(context, params, errno, "Failed to enable keep capabilities flag: %m");
4958 }
4959
4960 /* Save the current bounding set so we can restore it after applying the seccomp
4961 * filter */
4962 saved_bset = bset;
4963 bset |= (UINT64_C(1) << CAP_SYS_ADMIN) |
4964 (UINT64_C(1) << CAP_SETPCAP);
4965 }
4966 #endif
4967
4968 if (!cap_test_all(bset)) {
4969 r = capability_bounding_set_drop(bset, /* right_now= */ false);
4970 if (r < 0) {
4971 *exit_status = EXIT_CAPABILITIES;
4972 return log_exec_error_errno(context, params, r, "Failed to drop capabilities: %m");
4973 }
4974 }
4975
4976 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4977 * keep-caps set.
4978 *
4979 * To be able to raise the ambient capabilities after setresuid() they have to be added to
4980 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
4981 * the ambient capabilities can be raised as they are present in the permitted and
4982 * inheritable set. However it is possible that someone wants to set ambient capabilities
4983 * without changing the user, so we also set the ambient capabilities here.
4984 *
4985 * The requested ambient capabilities are raised in the inheritable set if the second
4986 * argument is true. */
4987 if (!needs_ambient_hack) {
4988 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
4989 if (r < 0) {
4990 *exit_status = EXIT_CAPABILITIES;
4991 return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (before UID change): %m");
4992 }
4993 }
4994 }
4995
4996 /* chroot to root directory first, before we lose the ability to chroot */
4997 r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
4998 if (r < 0)
4999 return log_exec_error_errno(context, params, r, "Chrooting to the requested root directory failed: %m");
5000
5001 if (needs_setuid) {
5002 if (uid_is_valid(uid)) {
5003 r = enforce_user(context, uid, capability_ambient_set);
5004 if (r < 0) {
5005 *exit_status = EXIT_USER;
5006 return log_exec_error_errno(context, params, r, "Failed to change UID to " UID_FMT ": %m", uid);
5007 }
5008
5009 if (keep_seccomp_privileges) {
5010 if (!FLAGS_SET(capability_ambient_set, (UINT64_C(1) << CAP_SETUID))) {
5011 r = drop_capability(CAP_SETUID);
5012 if (r < 0) {
5013 *exit_status = EXIT_USER;
5014 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETUID: %m");
5015 }
5016 }
5017
5018 r = keep_capability(CAP_SYS_ADMIN);
5019 if (r < 0) {
5020 *exit_status = EXIT_USER;
5021 return log_exec_error_errno(context, params, r, "Failed to keep CAP_SYS_ADMIN: %m");
5022 }
5023
5024 r = keep_capability(CAP_SETPCAP);
5025 if (r < 0) {
5026 *exit_status = EXIT_USER;
5027 return log_exec_error_errno(context, params, r, "Failed to keep CAP_SETPCAP: %m");
5028 }
5029 }
5030
5031 if (!needs_ambient_hack && capability_ambient_set != 0) {
5032
5033 /* Raise the ambient capabilities after user change. */
5034 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
5035 if (r < 0) {
5036 *exit_status = EXIT_CAPABILITIES;
5037 return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (after UID change): %m");
5038 }
5039 }
5040 }
5041 }
5042
5043 /* Apply working directory here, because the working directory might be on NFS and only the user
5044 * running this service might have the correct privilege to change to the working directory. Also, it
5045 * is absolutely 💣 crucial 💣 we applied all mount namespacing rearrangements before this, so that
5046 * the cwd cannot be used to pin directories outside of the sandbox. */
5047 r = apply_working_directory(context, params, runtime, home, exit_status);
5048 if (r < 0)
5049 return log_exec_error_errno(context, params, r, "Changing to the requested working directory failed: %m");
5050
5051 if (needs_sandboxing) {
5052 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5053 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5054 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5055 * are restricted. */
5056
5057 #if HAVE_SELINUX
5058 if (use_selinux) {
5059 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5060
5061 if (exec_context) {
5062 r = setexeccon(exec_context);
5063 if (r < 0) {
5064 if (!context->selinux_context_ignore) {
5065 *exit_status = EXIT_SELINUX_CONTEXT;
5066 return log_exec_error_errno(context, params, r, "Failed to change SELinux context to %s: %m", exec_context);
5067 }
5068 log_exec_debug_errno(context,
5069 params,
5070 r,
5071 "Failed to change SELinux context to %s, ignoring: %m",
5072 exec_context);
5073 }
5074 }
5075 }
5076 #endif
5077
5078 #if HAVE_APPARMOR
5079 if (use_apparmor && context->apparmor_profile) {
5080 r = aa_change_onexec(context->apparmor_profile);
5081 if (r < 0 && !context->apparmor_profile_ignore) {
5082 *exit_status = EXIT_APPARMOR_PROFILE;
5083 return log_exec_error_errno(context,
5084 params,
5085 errno,
5086 "Failed to prepare AppArmor profile change to %s: %m",
5087 context->apparmor_profile);
5088 }
5089 }
5090 #endif
5091
5092 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5093 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5094 * requires CAP_SETPCAP. */
5095 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5096 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5097 * effective set here.
5098 *
5099 * The effective set is overwritten during execve() with the following values:
5100 *
5101 * - ambient set (for non-root processes)
5102 *
5103 * - (inheritable | bounding) set (for root processes)
5104 *
5105 * Hence there is no security impact in raising it in the effective set before execve().
5106 */
5107 r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
5108 if (r < 0) {
5109 *exit_status = EXIT_CAPABILITIES;
5110 return log_exec_error_errno(context, params, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5111 }
5112 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5113 *exit_status = EXIT_SECUREBITS;
5114 return log_exec_error_errno(context, params, errno, "Failed to set process secure bits: %m");
5115 }
5116 }
5117
5118 if (context_has_no_new_privileges(context))
5119 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5120 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5121 return log_exec_error_errno(context, params, errno, "Failed to disable new privileges: %m");
5122 }
5123
5124 #if HAVE_SECCOMP
5125 r = apply_address_families(context, params);
5126 if (r < 0) {
5127 *exit_status = EXIT_ADDRESS_FAMILIES;
5128 return log_exec_error_errno(context, params, r, "Failed to restrict address families: %m");
5129 }
5130
5131 r = apply_memory_deny_write_execute(context, params);
5132 if (r < 0) {
5133 *exit_status = EXIT_SECCOMP;
5134 return log_exec_error_errno(context, params, r, "Failed to disable writing to executable memory: %m");
5135 }
5136
5137 r = apply_restrict_realtime(context, params);
5138 if (r < 0) {
5139 *exit_status = EXIT_SECCOMP;
5140 return log_exec_error_errno(context, params, r, "Failed to apply realtime restrictions: %m");
5141 }
5142
5143 r = apply_restrict_suid_sgid(context, params);
5144 if (r < 0) {
5145 *exit_status = EXIT_SECCOMP;
5146 return log_exec_error_errno(context, params, r, "Failed to apply SUID/SGID restrictions: %m");
5147 }
5148
5149 r = apply_restrict_namespaces(context, params);
5150 if (r < 0) {
5151 *exit_status = EXIT_SECCOMP;
5152 return log_exec_error_errno(context, params, r, "Failed to apply namespace restrictions: %m");
5153 }
5154
5155 r = apply_protect_sysctl(context, params);
5156 if (r < 0) {
5157 *exit_status = EXIT_SECCOMP;
5158 return log_exec_error_errno(context, params, r, "Failed to apply sysctl restrictions: %m");
5159 }
5160
5161 r = apply_protect_kernel_modules(context, params);
5162 if (r < 0) {
5163 *exit_status = EXIT_SECCOMP;
5164 return log_exec_error_errno(context, params, r, "Failed to apply module loading restrictions: %m");
5165 }
5166
5167 r = apply_protect_kernel_logs(context, params);
5168 if (r < 0) {
5169 *exit_status = EXIT_SECCOMP;
5170 return log_exec_error_errno(context, params, r, "Failed to apply kernel log restrictions: %m");
5171 }
5172
5173 r = apply_protect_clock(context, params);
5174 if (r < 0) {
5175 *exit_status = EXIT_SECCOMP;
5176 return log_exec_error_errno(context, params, r, "Failed to apply clock restrictions: %m");
5177 }
5178
5179 r = apply_private_devices(context, params);
5180 if (r < 0) {
5181 *exit_status = EXIT_SECCOMP;
5182 return log_exec_error_errno(context, params, r, "Failed to set up private devices: %m");
5183 }
5184
5185 r = apply_syscall_archs(context, params);
5186 if (r < 0) {
5187 *exit_status = EXIT_SECCOMP;
5188 return log_exec_error_errno(context, params, r, "Failed to apply syscall architecture restrictions: %m");
5189 }
5190
5191 r = apply_lock_personality(context, params);
5192 if (r < 0) {
5193 *exit_status = EXIT_SECCOMP;
5194 return log_exec_error_errno(context, params, r, "Failed to lock personalities: %m");
5195 }
5196
5197 r = apply_syscall_log(context, params);
5198 if (r < 0) {
5199 *exit_status = EXIT_SECCOMP;
5200 return log_exec_error_errno(context, params, r, "Failed to apply system call log filters: %m");
5201 }
5202 #endif
5203
5204 #if HAVE_LIBBPF
5205 r = apply_restrict_filesystems(context, params);
5206 if (r < 0) {
5207 *exit_status = EXIT_BPF;
5208 return log_exec_error_errno(context, params, r, "Failed to restrict filesystems: %m");
5209 }
5210 #endif
5211
5212 #if HAVE_SECCOMP
5213 /* This really should remain as close to the execve() as possible, to make sure our own code is affected
5214 * by the filter as little as possible. */
5215 r = apply_syscall_filter(context, params, needs_ambient_hack);
5216 if (r < 0) {
5217 *exit_status = EXIT_SECCOMP;
5218 return log_exec_error_errno(context, params, r, "Failed to apply system call filters: %m");
5219 }
5220
5221 if (keep_seccomp_privileges) {
5222 /* Restore the capability bounding set with what's expected from the service + the
5223 * ambient capabilities hack */
5224 if (!cap_test_all(saved_bset)) {
5225 r = capability_bounding_set_drop(saved_bset, /* right_now= */ false);
5226 if (r < 0) {
5227 *exit_status = EXIT_CAPABILITIES;
5228 return log_exec_error_errno(context, params, r, "Failed to drop bset capabilities: %m");
5229 }
5230 }
5231
5232 /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break
5233 * applications that use it. */
5234 if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SYS_ADMIN))) {
5235 r = drop_capability(CAP_SYS_ADMIN);
5236 if (r < 0) {
5237 *exit_status = EXIT_USER;
5238 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SYS_ADMIN: %m");
5239 }
5240 }
5241
5242 /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break
5243 * applications that use it. */
5244 if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SETPCAP))) {
5245 r = drop_capability(CAP_SETPCAP);
5246 if (r < 0) {
5247 *exit_status = EXIT_USER;
5248 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETPCAP: %m");
5249 }
5250 }
5251
5252 if (prctl(PR_SET_KEEPCAPS, 0) < 0) {
5253 *exit_status = EXIT_USER;
5254 return log_exec_error_errno(context, params, errno, "Failed to drop keep capabilities flag: %m");
5255 }
5256 }
5257 #endif
5258
5259 }
5260
5261 if (!strv_isempty(context->unset_environment)) {
5262 char **ee = NULL;
5263
5264 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5265 if (!ee) {
5266 *exit_status = EXIT_MEMORY;
5267 return log_oom();
5268 }
5269
5270 strv_free_and_replace(accum_env, ee);
5271 }
5272
5273 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5274 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5275
5276 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5277 if (r < 0) {
5278 *exit_status = EXIT_MEMORY;
5279 return log_exec_error_errno(context,
5280 params,
5281 r,
5282 "Failed to replace environment variables: %m");
5283 }
5284 final_argv = replaced_argv;
5285
5286 if (!strv_isempty(unset_variables)) {
5287 _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5288 log_exec_warning(context,
5289 params,
5290 "Referenced but unset environment variable evaluates to an empty string: %s",
5291 strna(ju));
5292 }
5293
5294 if (!strv_isempty(bad_variables)) {
5295 _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5296 log_exec_warning(context,
5297 params,
5298 "Invalid environment variable name evaluates to an empty string: %s",
5299 strna(jb));
5300 }
5301 } else
5302 final_argv = command->argv;
5303
5304 log_command_line(context, params, "Executing", executable, final_argv);
5305
5306 if (params->exec_fd >= 0) {
5307 uint8_t hot = 1;
5308
5309 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5310 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5311
5312 if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
5313 *exit_status = EXIT_EXEC;
5314 return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m");
5315 }
5316 }
5317
5318 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5319
5320 if (params->exec_fd >= 0) {
5321 uint8_t hot = 0;
5322
5323 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5324 * that POLLHUP on it no longer means execve() succeeded. */
5325
5326 if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
5327 *exit_status = EXIT_EXEC;
5328 return log_exec_error_errno(context, params, errno, "Failed to disable exec_fd: %m");
5329 }
5330 }
5331
5332 *exit_status = EXIT_EXEC;
5333 return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable);
5334 }