]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/exec-invoke.c
ec6729ae37b823740b10af71f0cd42099a208313
[thirdparty/systemd.git] / src / core / exec-invoke.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <grp.h>
4 #include <linux/ioprio.h>
5 #include <linux/prctl.h>
6 #include <linux/sched.h>
7 #include <linux/securebits.h>
8 #include <poll.h>
9 #include <sys/eventfd.h>
10 #include <sys/ioctl.h>
11 #include <sys/mount.h>
12 #include <sys/prctl.h>
13 #include <unistd.h>
14
15 #if HAVE_PAM
16 #include <security/pam_appl.h>
17 #endif
18
19 #include "sd-messages.h"
20
21 #include "apparmor-util.h"
22 #include "argv-util.h"
23 #include "ask-password-api.h"
24 #include "barrier.h"
25 #include "bitfield.h"
26 #include "bpf-dlopen.h"
27 #include "bpf-restrict-fs.h"
28 #include "btrfs-util.h"
29 #include "capability-util.h"
30 #include "cgroup-setup.h"
31 #include "cgroup.h"
32 #include "chase.h"
33 #include "chown-recursive.h"
34 #include "constants.h"
35 #include "copy.h"
36 #include "coredump-util.h"
37 #include "dissect-image.h"
38 #include "dynamic-user.h"
39 #include "env-util.h"
40 #include "escape.h"
41 #include "exec-credential.h"
42 #include "exec-invoke.h"
43 #include "execute.h"
44 #include "exit-status.h"
45 #include "fd-util.h"
46 #include "fs-util.h"
47 #include "hexdecoct.h"
48 #include "hostname-setup.h"
49 #include "image-policy.h"
50 #include "io-util.h"
51 #include "iovec-util.h"
52 #include "journal-send.h"
53 #include "manager.h"
54 #include "memfd-util.h"
55 #include "missing_sched.h"
56 #include "missing_syscall.h"
57 #include "mkdir-label.h"
58 #include "mount-util.h"
59 #include "namespace-util.h"
60 #include "nsflags.h"
61 #include "open-file.h"
62 #include "osc-context.h"
63 #include "path-util.h"
64 #include "pidref.h"
65 #include "proc-cmdline.h"
66 #include "process-util.h"
67 #include "psi-util.h"
68 #include "rlimit-util.h"
69 #include "seccomp-util.h"
70 #include "selinux-util.h"
71 #include "set.h"
72 #include "signal-util.h"
73 #include "smack-util.h"
74 #include "socket-util.h"
75 #include "stat-util.h"
76 #include "string-table.h"
77 #include "strv.h"
78 #include "terminal-util.h"
79 #include "user-util.h"
80 #include "utmp-wtmp.h"
81 #include "vpick.h"
82
83 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
84 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
85
86 #define SNDBUF_SIZE (8*1024*1024)
87
/* Prepares the fds that are about to be passed on: the first n_socket_fds entries get O_NONBLOCK
 * switched according to 'nonblock', and FD_CLOEXEC is turned off on all n_fds entries.
 *
 * Returns 0 on success, or the first negative error reported by fd_nonblock()/fd_cloexec(). */
static int flag_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        assert(fds || n_fds == 0);

        for (size_t idx = 0; idx < n_fds; idx++) {
                int fd = fds[idx], r;

                /* O_NONBLOCK is only adjusted for the leading socket activation fds. */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fd, nonblock);
                        if (r < 0)
                                return r;
                }

                /* These fds are meant to be passed to our children, hence FD_CLOEXEC always goes
                 * away. */
                r = fd_cloexec(fd, false);
                if (r < 0)
                        return r;
        }

        return 0;
}
120
121 static bool is_terminal_input(ExecInput i) {
122 return IN_SET(i,
123 EXEC_INPUT_TTY,
124 EXEC_INPUT_TTY_FORCE,
125 EXEC_INPUT_TTY_FAIL);
126 }
127
128 static bool is_terminal_output(ExecOutput o) {
129 return IN_SET(o,
130 EXEC_OUTPUT_TTY,
131 EXEC_OUTPUT_KMSG_AND_CONSOLE,
132 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
133 }
134
135 static bool is_kmsg_output(ExecOutput o) {
136 return IN_SET(o,
137 EXEC_OUTPUT_KMSG,
138 EXEC_OUTPUT_KMSG_AND_CONSOLE);
139 }
140
/* Opens /dev/null with the given flags (plus O_NOCTTY) and hands the resulting fd to move_fd() so that
 * it ends up as fd number 'nfd'. Returns -errno if the open fails, otherwise whatever move_fd()
 * returns. */
static int open_null_as(int flags, int nfd) {
        assert(nfd >= 0);

        int null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        return move_fd(null_fd, nfd, false);
}
152
/* Connects the socket 'fd' to the journal stream socket of the given log namespace. While connecting,
 * the effective GID and UID are temporarily switched to 'gid' and 'uid' (each only if valid) and put
 * back afterwards.
 *
 * Returns the result of connect_unix_path(), -EINVAL if no socket path could be determined, or -errno
 * if switching the effective GID/UID failed. */
static int connect_journal_socket(
                int fd,
                const char *log_namespace,
                uid_t uid,
                gid_t gid) {

        uid_t olduid = UID_INVALID;
        gid_t oldgid = GID_INVALID;
        const char *j;
        int r;

        assert(fd >= 0);

        j = journal_stream_path(log_namespace);
        if (!j)
                return -EINVAL;

        /* The group is switched first, the user second; restoring below happens in reverse order. */
        if (gid_is_valid(gid)) {
                oldgid = getgid();

                if (setegid(gid) < 0)
                        return -errno;
        }

        if (uid_is_valid(uid)) {
                olduid = getuid();

                if (seteuid(uid) < 0) {
                        r = -errno;
                        /* The effective UID was not changed, so only the GID has to be put back. */
                        goto restore_gid;
                }
        }

        r = connect_unix_path(fd, AT_FDCWD, j);

        /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
           an LSM interferes. */

        if (uid_is_valid(uid))
                (void) seteuid(olduid);

restore_gid:
        if (gid_is_valid(gid))
                (void) setegid(oldgid);

        return r;
}
200
201 static int connect_logger_as(
202 const ExecContext *context,
203 const ExecParameters *params,
204 ExecOutput output,
205 const char *ident,
206 int nfd,
207 uid_t uid,
208 gid_t gid) {
209
210 _cleanup_close_ int fd = -EBADF;
211 int r;
212
213 assert(context);
214 assert(params);
215 assert(output < _EXEC_OUTPUT_MAX);
216 assert(ident);
217 assert(nfd >= 0);
218
219 fd = socket(AF_UNIX, SOCK_STREAM, 0);
220 if (fd < 0)
221 return -errno;
222
223 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
224 if (r < 0)
225 return r;
226
227 if (shutdown(fd, SHUT_RD) < 0)
228 return -errno;
229
230 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
231
232 if (dprintf(fd,
233 "%s\n"
234 "%s\n"
235 "%i\n"
236 "%i\n"
237 "%i\n"
238 "%i\n"
239 "%i\n",
240 context->syslog_identifier ?: ident,
241 params->flags & EXEC_PASS_LOG_UNIT ? params->unit_id : "",
242 context->syslog_priority,
243 !!context->syslog_level_prefix,
244 false,
245 is_kmsg_output(output),
246 is_terminal_output(output)) < 0)
247 return -errno;
248
249 return move_fd(TAKE_FD(fd), nfd, false);
250 }
251
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves it to fd number 'nfd'.
 * Returns the error from open_terminal() if that fails, otherwise the result of move_fd(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        assert(path);
        assert(nfd >= 0);

        int tty_fd = open_terminal(path, flags | O_NOCTTY);
        if (tty_fd < 0)
                return tty_fd;

        return move_fd(tty_fd, nfd, false);
}
264
265 static int acquire_path(const char *path, int flags, mode_t mode) {
266 _cleanup_close_ int fd = -EBADF;
267 int r;
268
269 assert(path);
270
271 if (IN_SET(flags & O_ACCMODE_STRICT, O_WRONLY, O_RDWR))
272 flags |= O_CREAT;
273
274 fd = open(path, flags|O_NOCTTY, mode);
275 if (fd >= 0)
276 return TAKE_FD(fd);
277
278 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
279 return -errno;
280
281 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
282
283 fd = socket(AF_UNIX, SOCK_STREAM, 0);
284 if (fd < 0)
285 return -errno;
286
287 r = connect_unix_path(fd, AT_FDCWD, path);
288 if (IN_SET(r, -ENOTSOCK, -EINVAL))
289 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
290 * wasn't an AF_UNIX socket after all */
291 return -ENXIO;
292 if (r < 0)
293 return r;
294
295 if ((flags & O_ACCMODE_STRICT) == O_RDONLY)
296 r = shutdown(fd, SHUT_WR);
297 else if ((flags & O_ACCMODE_STRICT) == O_WRONLY)
298 r = shutdown(fd, SHUT_RD);
299 else
300 r = 0;
301 if (r < 0)
302 return -errno;
303
304 return TAKE_FD(fd);
305 }
306
307 static int fixup_input(
308 const ExecContext *context,
309 int socket_fd,
310 bool apply_tty_stdin) {
311
312 ExecInput std_input;
313
314 assert(context);
315
316 std_input = context->std_input;
317
318 if (is_terminal_input(std_input) && !apply_tty_stdin)
319 return EXEC_INPUT_NULL;
320
321 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
322 return EXEC_INPUT_NULL;
323
324 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
325 return EXEC_INPUT_NULL;
326
327 return std_input;
328 }
329
330 static int fixup_output(ExecOutput output, int socket_fd) {
331
332 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
333 return EXEC_OUTPUT_INHERIT;
334
335 return output;
336 }
337
/* Sets up STDIN_FILENO according to the execution context and parameters.
 *
 * If params->stdin_fd is set it takes precedence over everything else. Otherwise the effective input
 * mode is determined via fixup_input() and stdin is connected to /dev/null, a terminal, the socket fd,
 * a named fd, a sealed memfd carrying the configured data, or a file.
 *
 * Returns a negative errno-style value on failure, and the result of the respective
 * dup2()/move_fd() operation otherwise. */
static int setup_input(
                const ExecContext *context,
                const ExecParameters *params,
                int socket_fd,
                const int named_iofds[static 3]) {

        ExecInput i;
        int r;

        assert(context);
        assert(params);
        assert(named_iofds);

        /* An explicitly passed stdin fd overrides the configured input mode */
        if (params->stdin_fd >= 0) {
                if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
                        return -errno;

                /* Try to make this our controlling tty, if it is a tty */
                if (isatty_safe(STDIN_FILENO) && ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE) < 0)
                        log_debug_errno(errno, "Failed to make standard input TTY our controlling terminal: %m");

                return STDIN_FILENO;
        }

        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);

        switch (i) {

        case EXEC_INPUT_NULL:
                return open_null_as(O_RDONLY, STDIN_FILENO);

        case EXEC_INPUT_TTY:
        case EXEC_INPUT_TTY_FORCE:
        case EXEC_INPUT_TTY_FAIL: {
                _cleanup_close_ int tty_fd = -EBADF;
                _cleanup_free_ char *resolved = NULL;
                const char *tty_path;

                tty_path = ASSERT_PTR(exec_context_tty_path(context));

                /* If the TTY is the console, try to resolve it to the actual device; on failure we
                 * simply continue with the unresolved path. */
                if (tty_is_console(tty_path)) {
                        r = resolve_dev_console(&resolved);
                        if (r < 0)
                                log_debug_errno(r, "Failed to resolve /dev/console, ignoring: %m");
                        else {
                                log_debug("Resolved /dev/console to %s", resolved);
                                tty_path = resolved;
                        }
                }

                /* The three TTY input modes differ only in how the terminal is acquired */
                tty_fd = acquire_terminal(tty_path,
                                          i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
                                          i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
                                                                      ACQUIRE_TERMINAL_WAIT,
                                          USEC_INFINITY);
                if (tty_fd < 0)
                        return tty_fd;

                r = move_fd(tty_fd, STDIN_FILENO, /* cloexec= */ false);
                if (r < 0)
                        return r;

                /* The move succeeded, hence disarm the automatic close of tty_fd */
                TAKE_FD(tty_fd);
                return r;
        }

        case EXEC_INPUT_SOCKET:
                assert(socket_fd >= 0);

                return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));

        case EXEC_INPUT_NAMED_FD:
                assert(named_iofds[STDIN_FILENO] >= 0);

                /* Turn off O_NONBLOCK on the named fd (best effort) before duplicating it */
                (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
                return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));

        case EXEC_INPUT_DATA: {
                int fd;

                fd = memfd_new_and_seal("exec-input", context->stdin_data, context->stdin_data_size);
                if (fd < 0)
                        return fd;

                return move_fd(fd, STDIN_FILENO, false);
        }

        case EXEC_INPUT_FILE: {
                bool rw;
                int fd;

                assert(context->stdio_file[STDIN_FILENO]);

                /* If stdout or stderr are configured to use the very same file, open it read-write,
                 * so that the fd can be reused for them. */
                rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
                        (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));

                fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
                if (fd < 0)
                        return fd;

                return move_fd(fd, STDIN_FILENO, false);
        }

        default:
                assert_not_reached();
        }
}
445
446 static bool can_inherit_stderr_from_stdout(
447 const ExecContext *context,
448 ExecOutput o,
449 ExecOutput e) {
450
451 assert(context);
452
453 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
454 * stderr fd */
455
456 if (e == EXEC_OUTPUT_INHERIT)
457 return true;
458 if (e != o)
459 return false;
460
461 if (e == EXEC_OUTPUT_NAMED_FD)
462 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
463
464 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
465 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
466
467 return true;
468 }
469
/* Sets up the output stream selected by 'fileno' (STDOUT_FILENO or STDERR_FILENO) according to the
 * execution context and parameters.
 *
 * An explicitly passed params->stdout_fd/params->stderr_fd takes precedence. Otherwise the effective
 * input/output settings are computed via fixup_input()/fixup_output() and the stream is connected
 * accordingly. When the stream gets connected to the journal, the device and inode numbers of the
 * resulting fd are stored in *journal_stream_dev and *journal_stream_ino.
 *
 * Returns a negative errno-style value on failure, and the result of the respective
 * dup2()/move_fd() operation (or 'fileno' if the fd is left untouched) otherwise. */
static int setup_output(
                const ExecContext *context,
                const ExecParameters *params,
                int fileno,
                int socket_fd,
                const int named_iofds[static 3],
                const char *ident,
                uid_t uid,
                gid_t gid,
                dev_t *journal_stream_dev,
                ino_t *journal_stream_ino) {

        ExecOutput o;
        ExecInput i;
        int r;

        assert(context);
        assert(params);
        assert(ident);
        assert(journal_stream_dev);
        assert(journal_stream_ino);

        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {

                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
                        return -errno;

                return STDOUT_FILENO;
        }

        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
                        return -errno;

                return STDERR_FILENO;
        }

        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
        o = fixup_output(context->std_output, socket_fd);

        // FIXME: we probably should spend some time here to verify that if we inherit an fd from stdin
        // (possibly indirect via inheritance from stdout) it is actually opened for write!

        if (fileno == STDERR_FILENO) {
                ExecOutput e;
                e = fixup_output(context->std_error, socket_fd);

                /* This expects the input and output are already set up */

                /* Don't change the stderr file descriptor if we inherit all
                 * the way and are not on a tty */
                if (e == EXEC_OUTPUT_INHERIT &&
                    o == EXEC_OUTPUT_INHERIT &&
                    i == EXEC_INPUT_NULL &&
                    !is_terminal_input(context->std_input) &&
                    getppid() != 1)
                        return fileno;

                /* Duplicate from stdout if possible */
                if (can_inherit_stderr_from_stdout(context, o, e))
                        return RET_NERRNO(dup2(STDOUT_FILENO, fileno));

                /* From here on the stderr setting is what the switch below acts on */
                o = e;

        } else if (o == EXEC_OUTPUT_INHERIT) {
                /* If input got downgraded, inherit the original value */
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

                /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
                if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));

                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
                if (getppid() != 1)
                        return fileno;

                /* We need to open /dev/null here anew, to get the right access mode. */
                return open_null_as(O_WRONLY, fileno);
        }

        switch (o) {

        case EXEC_OUTPUT_NULL:
                return open_null_as(O_WRONLY, fileno);

        case EXEC_OUTPUT_TTY:
                /* If stdin is a terminal too, reuse its fd rather than opening the terminal again */
                if (is_terminal_input(i))
                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));

                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

        case EXEC_OUTPUT_KMSG:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                r = connect_logger_as(context, params, o, ident, fileno, uid, gid);
                if (r < 0) {
                        /* Not fatal: warn and fall back to /dev/null */
                        log_warning_errno(r, "Failed to connect %s to the journal socket, ignoring: %m",
                                          fileno == STDOUT_FILENO ? "stdout" : "stderr");
                        r = open_null_as(O_WRONLY, fileno);
                } else {
                        struct stat st;

                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
                         * services to detect whether they are connected to the journal or not.
                         *
                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
                         * about STDERR as that's usually the best way to do logging. */

                        if (fstat(fileno, &st) >= 0 &&
                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
                                *journal_stream_dev = st.st_dev;
                                *journal_stream_ino = st.st_ino;
                        }
                }
                return r;

        case EXEC_OUTPUT_SOCKET:
                assert(socket_fd >= 0);

                return RET_NERRNO(dup2(socket_fd, fileno));

        case EXEC_OUTPUT_NAMED_FD:
                assert(named_iofds[fileno] >= 0);

                /* Turn off O_NONBLOCK on the named fd (best effort) before duplicating it */
                (void) fd_nonblock(named_iofds[fileno], false);
                return RET_NERRNO(dup2(named_iofds[fileno], fileno));

        case EXEC_OUTPUT_FILE:
        case EXEC_OUTPUT_FILE_APPEND:
        case EXEC_OUTPUT_FILE_TRUNCATE: {
                bool rw;
                int fd, flags;

                assert(context->stdio_file[fileno]);

                /* If stdin is configured to use the very same file, duplicate the stdin fd instead of
                 * opening the file a second time. */
                rw = context->std_input == EXEC_INPUT_FILE &&
                        streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);

                if (rw)
                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));

                flags = O_WRONLY;
                if (o == EXEC_OUTPUT_FILE_APPEND)
                        flags |= O_APPEND;
                else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
                        flags |= O_TRUNC;

                fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
                if (fd < 0)
                        return fd;

                return move_fd(fd, fileno, 0);
        }

        default:
                assert_not_reached();
        }
}
631
632 static int chown_terminal(int fd, uid_t uid) {
633 int r;
634
635 assert(fd >= 0);
636
637 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
638 if (!isatty_safe(fd))
639 return 0;
640
641 /* This might fail. What matters are the results. */
642 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
643 if (r < 0)
644 return r;
645
646 return 1;
647 }
648
/* Temporarily redirects stdin and stdout to the terminal 'vc', so that a confirmation question can be
 * asked there.
 *
 * On success (return value 0) duplicates of the original stdin and stdout are returned in
 * *ret_saved_stdin and *ret_saved_stdout; they are meant to be handed to restore_confirm_stdio()
 * later. On failure a negative errno-style value is returned and the output parameters are left
 * untouched. */
static int setup_confirm_stdio(
                const ExecContext *context,
                const char *vc,
                int *ret_saved_stdin,
                int *ret_saved_stdout) {

        _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
        int r;

        assert(context);
        assert(ret_saved_stdin);
        assert(ret_saved_stdout);

        /* Keep copies of the current stdin/stdout on fd numbers >= 3, with FD_CLOEXEC set */
        saved_stdin = fcntl(STDIN_FILENO, F_DUPFD_CLOEXEC, 3);
        if (saved_stdin < 0)
                return -errno;

        saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD_CLOEXEC, 3);
        if (saved_stdout < 0)
                return -errno;

        fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
        if (fd < 0)
                return fd;

        /* Failing to take the console lock is not fatal; the lock fd (if any) is closed when this
         * function returns. */
        _cleanup_close_ int lock_fd = lock_dev_console();
        if (lock_fd < 0)
                log_debug_errno(lock_fd, "Failed to lock /dev/console, ignoring: %m");

        r = chown_terminal(fd, getuid());
        if (r < 0)
                return r;

        r = terminal_reset_defensive(fd, TERMINAL_RESET_SWITCH_TO_TEXT);
        if (r < 0)
                return r;

        r = exec_context_apply_tty_size(context, fd, fd, vc);
        if (r < 0)
                return r;

        r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
        TAKE_FD(fd);
        if (r < 0)
                return r;

        /* Success: pass ownership of the saved fds to the caller */
        *ret_saved_stdin = TAKE_FD(saved_stdin);
        *ret_saved_stdout = TAKE_FD(saved_stdout);
        return 0;
}
699
/* Writes a one-line note to 'fd' explaining that the confirmation question for 'unit_id' could not be
 * answered (error 'err', sign ignored) and that a positive response is assumed. */
static void write_confirm_error_fd(int err, int fd, const char *unit_id) {
        assert(err != 0);
        assert(fd >= 0);
        assert(unit_id);

        int code = abs(err);

        /* The non-timeout message below formats the error via %m, which reads errno */
        errno = code;

        if (code != ETIMEDOUT) {
                dprintf(fd, "Couldn't ask confirmation for %s, assuming positive response: %m\n", unit_id);
                return;
        }

        dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", unit_id);
}
712
713 static void write_confirm_error(int err, const char *vc, const char *unit_id) {
714 _cleanup_close_ int fd = -EBADF;
715
716 assert(vc);
717
718 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
719 if (fd < 0)
720 return;
721
722 write_confirm_error_fd(err, fd, unit_id);
723 }
724
/* Counterpart of setup_confirm_stdio(): releases the terminal, puts the saved fds (where valid) back in
 * place as stdin/stdout and closes the saved copies.
 *
 * Returns 0 on success. If restoring fails, the -errno of the last failed dup2() is returned; the saved
 * fds are closed either way. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
746
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* execute the command */
};
752
/* Returns true if the flag file /run/systemd/confirm_spawn_disabled is present. */
static bool confirm_spawn_disabled(void) {
        if (access("/run/systemd/confirm_spawn_disabled", F_OK) < 0)
                return false;

        return true;
}
756
757 static int ask_for_confirmation(const ExecContext *context, const ExecParameters *params, const char *cmdline) {
758 int saved_stdout = -EBADF, saved_stdin = -EBADF, r;
759 _cleanup_free_ char *e = NULL;
760 char c;
761
762 assert(context);
763 assert(params);
764
765 /* For any internal errors, assume a positive response. */
766 r = setup_confirm_stdio(context, params->confirm_spawn, &saved_stdin, &saved_stdout);
767 if (r < 0) {
768 write_confirm_error(r, params->confirm_spawn, params->unit_id);
769 return CONFIRM_EXECUTE;
770 }
771
772 /* confirm_spawn might have been disabled while we were sleeping. */
773 if (!params->confirm_spawn || confirm_spawn_disabled()) {
774 r = 1;
775 goto restore_stdio;
776 }
777
778 e = ellipsize(cmdline, 60, 100);
779 if (!e) {
780 log_oom();
781 r = CONFIRM_EXECUTE;
782 goto restore_stdio;
783 }
784
785 for (;;) {
786 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
787 if (r < 0) {
788 write_confirm_error_fd(r, STDOUT_FILENO, params->unit_id);
789 r = CONFIRM_EXECUTE;
790 goto restore_stdio;
791 }
792
793 switch (c) {
794 case 'c':
795 printf("Resuming normal execution.\n");
796 manager_disable_confirm_spawn();
797 r = 1;
798 break;
799 case 'D':
800 printf(" Unit: %s\n",
801 params->unit_id);
802 exec_context_dump(context, stdout, " ");
803 exec_params_dump(params, stdout, " ");
804 continue; /* ask again */
805 case 'f':
806 printf("Failing execution.\n");
807 r = CONFIRM_PRETEND_FAILURE;
808 break;
809 case 'h':
810 printf(" c - continue, proceed without asking anymore\n"
811 " D - dump, show the state of the unit\n"
812 " f - fail, don't execute the command and pretend it failed\n"
813 " h - help\n"
814 " i - info, show a short summary of the unit\n"
815 " j - jobs, show jobs that are in progress\n"
816 " s - skip, don't execute the command and pretend it succeeded\n"
817 " y - yes, execute the command\n");
818 continue; /* ask again */
819 case 'i':
820 printf(" Unit: %s\n"
821 " Command: %s\n",
822 params->unit_id, cmdline);
823 continue; /* ask again */
824 case 'j':
825 if (sigqueue(getppid(),
826 SIGRTMIN+18,
827 (const union sigval) { .sival_int = MANAGER_SIGNAL_COMMAND_DUMP_JOBS }) < 0)
828 return -errno;
829
830 continue; /* ask again */
831 case 'n':
832 /* 'n' was removed in favor of 'f'. */
833 printf("Didn't understand 'n', did you mean 'f'?\n");
834 continue; /* ask again */
835 case 's':
836 printf("Skipping execution.\n");
837 r = CONFIRM_PRETEND_SUCCESS;
838 break;
839 case 'y':
840 r = CONFIRM_EXECUTE;
841 break;
842 default:
843 assert_not_reached();
844 }
845 break;
846 }
847
848 restore_stdio:
849 restore_confirm_stdio(&saved_stdin, &saved_stdout);
850 return r;
851 }
852
853 static int get_fixed_user(
854 const char *user_or_uid,
855 bool prefer_nss,
856 const char **ret_username,
857 uid_t *ret_uid,
858 gid_t *ret_gid,
859 const char **ret_home,
860 const char **ret_shell) {
861
862 int r;
863
864 assert(user_or_uid);
865 assert(ret_username);
866
867 r = get_user_creds(&user_or_uid, ret_uid, ret_gid, ret_home, ret_shell,
868 USER_CREDS_CLEAN|(prefer_nss ? USER_CREDS_PREFER_NSS : 0));
869 if (r < 0)
870 return r;
871
872 /* user_or_uid is normalized by get_user_creds to username */
873 *ret_username = user_or_uid;
874
875 return 0;
876 }
877
/* Resolves the group name or GID string 'group_or_gid' via get_group_creds().
 *
 * On success returns 0 and stores the group name as normalized by get_group_creds() in *ret_groupname;
 * 'ret_gid' is passed straight through. On failure returns the negative error of get_group_creds(). */
static int get_fixed_group(
                const char *group_or_gid,
                const char **ret_groupname,
                gid_t *ret_gid) {

        assert(group_or_gid);
        assert(ret_groupname);

        /* get_group_creds() updates the pointer we hand in so that it refers to the group name */
        const char *name = group_or_gid;

        int r = get_group_creds(&name, ret_gid, /* flags = */ 0);
        if (r < 0)
                return r;

        *ret_groupname = name;
        return 0;
}
897
/* Determines the list of supplementary groups for the process.
 *
 * If 'user' is set and 'gid' is valid and non-zero, the process' groups are first initialized via
 * initgroups(). If c->supplementary_groups is non-empty, a list is assembled that consists of the
 * user's groups (only in the initgroups() case) followed by the configured supplementary groups.
 *
 * Returns the number of entries and stores a newly allocated array in *ret_gids. Returns 0 and sets
 * *ret_gids to NULL if there is no list to apply. Returns a negative errno-style value on failure. */
static int get_supplementary_groups(
                const ExecContext *c,
                const char *user,
                gid_t gid,
                gid_t **ret_gids) {

        int r;

        assert(c);
        assert(ret_gids);

        /*
         * If user is given, then lookup GID and supplementary groups list.
         * We avoid NSS lookups for gid=0. Also we have to initialize groups
         * here and as early as possible so we keep the list of supplementary
         * groups of the caller.
         */
        bool keep_groups = false;
        if (user && gid_is_valid(gid) && gid != 0) {
                /* First step, initialize groups via initgroups() */
                if (initgroups(user, gid) < 0)
                        return -errno;

                keep_groups = true;
        }

        /* Nothing configured in SupplementaryGroups=? Then there is no list to assemble. */
        if (strv_isempty(c->supplementary_groups)) {
                *ret_gids = NULL;
                return 0;
        }

        /*
         * If SupplementaryGroups= was passed then NGROUPS_MAX has to
         * be positive, otherwise fail.
         */
        errno = 0;
        int ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
        if (ngroups_max <= 0)
                return errno_or_else(EOPNOTSUPP);

        _cleanup_free_ gid_t *l_gids = new(gid_t, ngroups_max);
        if (!l_gids)
                return -ENOMEM;

        /* 'k' counts the entries of l_gids filled in so far */
        int k = 0;
        if (keep_groups) {
                /*
                 * Lookup the list of groups that the user belongs to, we
                 * avoid NSS lookups here too for gid=0.
                 */
                k = ngroups_max;
                if (getgrouplist(user, gid, l_gids, &k) < 0)
                        return -EINVAL;
        }

        /* Append the configured groups, refusing to go beyond ngroups_max entries */
        STRV_FOREACH(i, c->supplementary_groups) {
                if (k >= ngroups_max)
                        return -E2BIG;

                const char *g = *i;
                r = get_group_creds(&g, l_gids + k, /* flags = */ 0);
                if (r < 0)
                        return r;

                k++;
        }

        if (k == 0) {
                *ret_gids = NULL;
                return 0;
        }

        /* Otherwise get the final list of supplementary groups */
        gid_t *groups = newdup(gid_t, l_gids, k);
        if (!groups)
                return -ENOMEM;

        *ret_gids = groups;
        return k;
}
978
/* Applies the group credentials: first the supplementary group list (if it has any entries), then the
 * real, effective and saved GID (if 'gid' is valid). Returns 0 on success, negative on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
997
/* Updates the securebits of the calling process: all bits in 'mask' are cleared, then all bits in
 * 'bits' are set.
 *
 * Returns 0 if the resulting value equals the current one (in which case nothing is written), 1 if the
 * securebits were changed, or -errno if prctl() failed. */
static int set_securebits(unsigned bits, unsigned mask) {
        int r = prctl(PR_GET_SECUREBITS);
        if (r < 0)
                return -errno;

        unsigned before = (unsigned) r;
        unsigned after = (before & ~mask) | bits;

        /* Avoid the write if it would not change anything */
        if (after == before)
                return 0;

        if (prctl(PR_SET_SECUREBITS, after) < 0)
                return -errno;

        return 1;
}
1016
1017 static int enforce_user(
1018 const ExecContext *context,
1019 uid_t uid,
1020 uint64_t capability_ambient_set) {
1021
1022 int r;
1023
1024 assert(context);
1025
1026 if (!uid_is_valid(uid))
1027 return 0;
1028
1029 /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1030 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1031 * case. */
1032
1033 if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1034
1035 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1036 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1037 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1038 if (r < 0)
1039 return r;
1040 }
1041
1042 /* Second step: actually set the uids */
1043 if (setresuid(uid, uid, uid) < 0)
1044 return -errno;
1045
1046 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1047 * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1048 * outside of this call. */
1049 return 0;
1050 }
1051
1052 #if HAVE_PAM
1053
1054 static void pam_response_free_array(struct pam_response *responses, size_t n_responses) {
1055 assert(responses || n_responses == 0);
1056
1057 FOREACH_ARRAY(resp, responses, n_responses)
1058 erase_and_free(resp->resp);
1059
1060 free(responses);
1061 }
1062
/* Userdata handed to ask_password_conv() through the appdata_ptr of the PAM conversation structure.
 * Both pointers are asserted to be set by the callback. */
typedef struct AskPasswordConvData {
        const ExecContext *context;   /* source of pam_name; also used to locate the credentials directory */
        const ExecParameters *params; /* source of unit_id; also used to locate the credentials directory */
} AskPasswordConvData;
1067
1068 static int ask_password_conv(
1069 int num_msg,
1070 const struct pam_message *msg[],
1071 struct pam_response **ret,
1072 void *userdata) {
1073
1074 AskPasswordConvData *data = ASSERT_PTR(userdata);
1075 bool set_credential_env_var = false;
1076 int r;
1077
1078 assert(num_msg >= 0);
1079 assert(msg);
1080 assert(data->context);
1081 assert(data->params);
1082
1083 size_t n = num_msg;
1084 struct pam_response *responses = new0(struct pam_response, n);
1085 if (!responses)
1086 return PAM_BUF_ERR;
1087 CLEANUP_ARRAY(responses, n, pam_response_free_array);
1088
1089 for (size_t i = 0; i < n; i++) {
1090 const struct pam_message *mi = *msg + i;
1091
1092 switch (mi->msg_style) {
1093
1094 case PAM_PROMPT_ECHO_ON:
1095 case PAM_PROMPT_ECHO_OFF: {
1096
1097 /* Locally set the $CREDENTIALS_DIRECTORY to the credentials directory we just populated */
1098 if (!set_credential_env_var) {
1099 _cleanup_free_ char *creds_dir = NULL;
1100 r = exec_context_get_credential_directory(data->context, data->params, data->params->unit_id, &creds_dir);
1101 if (r < 0)
1102 return log_error_errno(r, "Failed to determine credentials directory: %m");
1103
1104 if (creds_dir) {
1105 if (setenv("CREDENTIALS_DIRECTORY", creds_dir, /* overwrite= */ true) < 0)
1106 return log_error_errno(r, "Failed to set $CREDENTIALS_DIRECTORY: %m");
1107 } else
1108 (void) unsetenv("CREDENTIALS_DIRECTORY");
1109
1110 set_credential_env_var = true;
1111 }
1112
1113 _cleanup_free_ char *credential_name = strjoin("pam.authtok.", data->context->pam_name);
1114 if (!credential_name)
1115 return log_oom();
1116
1117 AskPasswordRequest req = {
1118 .message = mi->msg,
1119 .credential = credential_name,
1120 .tty_fd = -EBADF,
1121 .hup_fd = -EBADF,
1122 .until = usec_add(now(CLOCK_MONOTONIC), 15 * USEC_PER_SEC),
1123 };
1124
1125 _cleanup_strv_free_erase_ char **acquired = NULL;
1126 r = ask_password_auto(
1127 &req,
1128 ASK_PASSWORD_ACCEPT_CACHED|
1129 ASK_PASSWORD_NO_TTY|
1130 (mi->msg_style == PAM_PROMPT_ECHO_ON ? ASK_PASSWORD_ECHO : 0),
1131 &acquired);
1132 if (r < 0) {
1133 log_error_errno(r, "Failed to query for password: %m");
1134 return PAM_CONV_ERR;
1135 }
1136
1137 responses[i].resp = strdup(ASSERT_PTR(acquired[0]));
1138 if (!responses[i].resp) {
1139 log_oom();
1140 return PAM_BUF_ERR;
1141 }
1142 break;
1143 }
1144
1145 case PAM_ERROR_MSG:
1146 log_error("PAM: %s", mi->msg);
1147 break;
1148
1149 case PAM_TEXT_INFO:
1150 log_info("PAM: %s", mi->msg);
1151 break;
1152
1153 default:
1154 return PAM_CONV_ERR;
1155 }
1156 }
1157
1158 *ret = TAKE_PTR(responses);
1159 n = 0;
1160
1161 return PAM_SUCCESS;
1162 }
1163
1164 static int pam_close_session_and_delete_credentials(pam_handle_t *handle, int flags) {
1165 int r, s;
1166
1167 assert(handle);
1168
1169 r = pam_close_session(handle, flags);
1170 if (r != PAM_SUCCESS)
1171 log_debug("pam_close_session() failed: %s", pam_strerror(handle, r));
1172
1173 s = pam_setcred(handle, PAM_DELETE_CRED | flags);
1174 if (s != PAM_SUCCESS)
1175 log_debug("pam_setcred(PAM_DELETE_CRED) failed: %s", pam_strerror(handle, s));
1176
1177 return r != PAM_SUCCESS ? r : s;
1178 }
1179 #endif
1180
1181 static int attach_to_subcgroup(
1182 const ExecContext *context,
1183 const CGroupContext *cgroup_context,
1184 const ExecParameters *params,
1185 const char *prefix) {
1186
1187 _cleanup_free_ char *subgroup = NULL;
1188 int r;
1189
1190 assert(context);
1191 assert(cgroup_context);
1192 assert(params);
1193
1194 /* If we're a control process that needs a subgroup, we've already been spawned into it as otherwise
1195 * we'd violate the "no inner processes" rule, so no need to do anything. */
1196 if (exec_params_needs_control_subcgroup(params))
1197 return 0;
1198
1199 r = exec_params_get_cgroup_path(params, cgroup_context, prefix, &subgroup);
1200 if (r < 0)
1201 return log_error_errno(r, "Failed to acquire cgroup path: %m");
1202 /* No subgroup required? Then there's nothing to do. */
1203 if (r == 0)
1204 return 0;
1205
1206 r = cg_attach(subgroup, 0);
1207 if (r == -EUCLEAN)
1208 return log_error_errno(r,
1209 "Failed to attach process " PID_FMT " to cgroup '%s', "
1210 "because the cgroup or one of its parents or "
1211 "siblings is in the threaded mode.",
1212 getpid_cached(), subgroup);
1213 if (r < 0)
1214 return log_error_errno(r,
1215 "Failed to attach process " PID_FMT " to cgroup %s: %m",
1216 getpid_cached(), subgroup);
1217
1218 return 0;
1219 }
1220
/* Opens a PAM session for 'user' under the service name context->pam_name and forks off a "(sd-pam)"
 * helper process whose job is to close the session again once this process goes away.
 *
 * Sequence in the calling process: pam_start(), PAM_TTY is set if stdin has a tty name, the current
 * environment in *env is exported to PAM, then pam_acct_mgmt(), pam_setcred(PAM_ESTABLISH_CRED) (failure
 * is logged and ignored) and pam_open_session(). Afterwards the helper is forked and the caller waits on
 * a barrier until the helper has finished its setup.
 *
 * On success *env is replaced by the environment list returned by pam_getenvlist() and the result of
 * strv_free_and_replace() is returned. On failure the problem is logged, any opened session is closed
 * again, and a negative errno-style value is returned; every PAM error is reported as -EPERM.
 *
 * When built without HAVE_PAM this is a no-op that returns 0. */
static int setup_pam(
                const ExecContext *context,
                const CGroupContext *cgroup_context,
                ExecParameters *params,
                const char *user,
                uid_t uid,
                gid_t gid,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds,
                bool needs_sandboxing,
                int exec_fd) {

#if HAVE_PAM
        AskPasswordConvData conv_data = {
                .context = context,
                .params = params,
        };

        const struct pam_conv conv = {
                .conv = ask_password_conv,
                .appdata_ptr = &conv_data,
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        _cleanup_free_ char *tty = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t parent_pid;
        int flags = 0;

        assert(context);
        assert(params);
        assert(user);
        assert(uid_is_valid(uid));
        assert(gid_is_valid(gid));
        assert(fds || n_fds == 0);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Unless debug logging is enabled, ask the PAM modules to stay quiet. */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(context->pam_name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                /* Make sure the failure path does not try to use the handle. */
                handle = NULL;
                goto fail;
        }

        /* If stdin refers to a tty, turn its name into a /dev/ path so that it can be passed as PAM_TTY. */
        if (getttyname_malloc(STDIN_FILENO, &tty) >= 0) {
                _cleanup_free_ char *q = path_join("/dev", tty);
                if (!q) {
                        r = -ENOMEM;
                        goto fail;
                }

                free_and_replace(tty, q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the environment assembled so far to the PAM modules. */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failing to establish credentials is not fatal, it is only logged. */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred(PAM_ESTABLISH_CRED) failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again. */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM) >= 0);

        /* Remember our PID, so that the child can detect via getppid() whether we are still around. */
        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, NULL);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int ret = EXIT_PAM;

                if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) {
                        /* Move PAM process into subgroup immediately if the main process hasn't been moved
                         * into the subgroup yet (when cgroup namespacing is enabled) and a subgroup is
                         * configured. */
                        r = attach_to_subcgroup(context, cgroup_context, params, params->cgroup_path);
                        if (r < 0)
                                /* NOTE(review): this returns into the caller from within the forked child
                                 * rather than leaving via child_finish/_exit() — confirm this is intended. */
                                return r;
                }

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Also close the 'exec_fd' in the child, since the service manager waits for the EOF induced
                 * by the execve() to wait for completion, and if we'd keep the fd open here in the child
                 * we'd never signal completion. */
                exec_fd = safe_close(exec_fd);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = fully_set_uid_gid(uid, gid, /* supplementary_gids= */ NULL, /* n_supplementary_gids= */ 0);
                if (r < 0)
                        log_warning_errno(r, "Failed to drop privileges in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;
                        int sig;

                        /* The parent is still there: sleep until the SIGTERM blocked above arrives. */
                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        assert_se(sigwait(&ss, &sig) == 0);
                        assert(sig == SIGTERM);
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session_and_delete_credentials(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand PAM's environment list to the caller, freeing the previous one. */
        return strv_free_and_replace(*env, e);

fail:
        /* Either a PAM call failed (pam_code is set) or one of our own calls did (r is set). */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session_and_delete_credentials(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1447
1448 static void rename_process_from_path(const char *path) {
1449 _cleanup_free_ char *buf = NULL;
1450 const char *p;
1451
1452 assert(path);
1453
1454 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1455 * /bin/ps */
1456
1457 if (path_extract_filename(path, &buf) < 0) {
1458 rename_process("(...)");
1459 return;
1460 }
1461
1462 size_t l = strlen(buf);
1463 if (l > 8) {
1464 /* The end of the process name is usually more interesting, since the first bit might just be
1465 * "systemd-" */
1466 p = buf + l - 8;
1467 l = 8;
1468 } else
1469 p = buf;
1470
1471 char process_name[11];
1472 process_name[0] = '(';
1473 memcpy(process_name+1, p, l);
1474 process_name[1+l] = ')';
1475 process_name[1+l+1] = 0;
1476
1477 (void) rename_process(process_name);
1478 }
1479
1480 static bool context_has_address_families(const ExecContext *c) {
1481 assert(c);
1482
1483 return c->address_families_allow_list ||
1484 !set_isempty(c->address_families);
1485 }
1486
1487 static bool context_has_syscall_filters(const ExecContext *c) {
1488 assert(c);
1489
1490 return c->syscall_allow_list ||
1491 !hashmap_isempty(c->syscall_filter);
1492 }
1493
1494 static bool context_has_syscall_logs(const ExecContext *c) {
1495 assert(c);
1496
1497 return c->syscall_log_allow_list ||
1498 !hashmap_isempty(c->syscall_log);
1499 }
1500
1501 static bool context_has_seccomp(const ExecContext *c) {
1502 assert(c);
1503
1504 /* We need NNP if we have any form of seccomp and are unprivileged */
1505 return c->lock_personality ||
1506 c->memory_deny_write_execute ||
1507 c->private_devices ||
1508 c->protect_clock ||
1509 c->protect_hostname == PROTECT_HOSTNAME_YES ||
1510 c->protect_kernel_tunables ||
1511 c->protect_kernel_modules ||
1512 c->protect_kernel_logs ||
1513 context_has_address_families(c) ||
1514 exec_context_restrict_namespaces_set(c) ||
1515 c->restrict_realtime ||
1516 c->restrict_suid_sgid ||
1517 !set_isempty(c->syscall_archs) ||
1518 context_has_syscall_filters(c) ||
1519 context_has_syscall_logs(c);
1520 }
1521
1522 static bool context_has_no_new_privileges(const ExecContext *c) {
1523 assert(c);
1524
1525 if (c->no_new_privileges)
1526 return true;
1527
1528 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1529 return false;
1530
1531 return context_has_seccomp(c);
1532 }
1533
1534 #if HAVE_SECCOMP
1535
1536 static bool seccomp_allows_drop_privileges(const ExecContext *c) {
1537 void *id, *val;
1538 bool have_capget = false, have_capset = false, have_prctl = false;
1539
1540 assert(c);
1541
1542 /* No syscall filter, we are allowed to drop privileges */
1543 if (hashmap_isempty(c->syscall_filter))
1544 return true;
1545
1546 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
1547 _cleanup_free_ char *name = NULL;
1548
1549 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
1550
1551 if (streq(name, "capget"))
1552 have_capget = true;
1553 else if (streq(name, "capset"))
1554 have_capset = true;
1555 else if (streq(name, "prctl"))
1556 have_prctl = true;
1557 }
1558
1559 if (c->syscall_allow_list)
1560 return have_capget && have_capset && have_prctl;
1561 else
1562 return !(have_capget || have_capset || have_prctl);
1563 }
1564
/* Returns true (after logging a debug message that names 'msg') if seccomp is not available and the
 * caller should hence skip applying the setting; returns false if seccomp can be used. */
static bool skip_seccomp_unavailable(const char *msg) {
        assert(msg);

        bool unavailable = !is_seccomp_available();
        if (unavailable)
                log_debug("SECCOMP features not detected in the kernel, skipping %s", msg);

        return unavailable;
}
1574
1575 static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p) {
1576 uint32_t negative_action, default_action, action;
1577 int r;
1578
1579 assert(c);
1580 assert(p);
1581
1582 if (!context_has_syscall_filters(c))
1583 return 0;
1584
1585 if (skip_seccomp_unavailable("SystemCallFilter="))
1586 return 0;
1587
1588 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1589
1590 if (c->syscall_allow_list) {
1591 default_action = negative_action;
1592 action = SCMP_ACT_ALLOW;
1593 } else {
1594 default_action = SCMP_ACT_ALLOW;
1595 action = negative_action;
1596 }
1597
1598 /* Sending over exec_fd or handoff_timestamp_fd requires write() syscall. */
1599 if (p->exec_fd >= 0 || p->handoff_timestamp_fd >= 0) {
1600 r = seccomp_filter_set_add_by_name(c->syscall_filter, c->syscall_allow_list, "write");
1601 if (r < 0)
1602 return r;
1603 }
1604
1605 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1606 }
1607
1608 static int apply_syscall_log(const ExecContext *c, const ExecParameters *p) {
1609 #ifdef SCMP_ACT_LOG
1610 uint32_t default_action, action;
1611 #endif
1612
1613 assert(c);
1614 assert(p);
1615
1616 if (!context_has_syscall_logs(c))
1617 return 0;
1618
1619 #ifdef SCMP_ACT_LOG
1620 if (skip_seccomp_unavailable("SystemCallLog="))
1621 return 0;
1622
1623 if (c->syscall_log_allow_list) {
1624 /* Log nothing but the ones listed */
1625 default_action = SCMP_ACT_ALLOW;
1626 action = SCMP_ACT_LOG;
1627 } else {
1628 /* Log everything but the ones listed */
1629 default_action = SCMP_ACT_LOG;
1630 action = SCMP_ACT_ALLOW;
1631 }
1632
1633 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1634 #else
1635 /* old libseccomp */
1636 log_debug( "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1637 return 0;
1638 #endif
1639 }
1640
1641 static int apply_syscall_archs(const ExecContext *c, const ExecParameters *p) {
1642 assert(c);
1643 assert(p);
1644
1645 if (set_isempty(c->syscall_archs))
1646 return 0;
1647
1648 if (skip_seccomp_unavailable("SystemCallArchitectures="))
1649 return 0;
1650
1651 return seccomp_restrict_archs(c->syscall_archs);
1652 }
1653
1654 static int apply_address_families(const ExecContext *c, const ExecParameters *p) {
1655 assert(c);
1656 assert(p);
1657
1658 if (!context_has_address_families(c))
1659 return 0;
1660
1661 if (skip_seccomp_unavailable("RestrictAddressFamilies="))
1662 return 0;
1663
1664 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1665 }
1666
1667 static int apply_memory_deny_write_execute(const ExecContext *c, const ExecParameters *p) {
1668 int r;
1669
1670 assert(c);
1671 assert(p);
1672
1673 if (!c->memory_deny_write_execute)
1674 return 0;
1675
1676 /* use prctl() if kernel supports it (6.3) */
1677 r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1678 if (r == 0) {
1679 log_debug("Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1680 return 0;
1681 }
1682 if (r < 0 && errno != EINVAL)
1683 return log_debug_errno(errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1684 /* else use seccomp */
1685 log_debug("Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1686
1687 if (skip_seccomp_unavailable("MemoryDenyWriteExecute="))
1688 return 0;
1689
1690 return seccomp_memory_deny_write_execute();
1691 }
1692
1693 static int apply_restrict_realtime(const ExecContext *c, const ExecParameters *p) {
1694 assert(c);
1695 assert(p);
1696
1697 if (!c->restrict_realtime)
1698 return 0;
1699
1700 if (skip_seccomp_unavailable("RestrictRealtime="))
1701 return 0;
1702
1703 return seccomp_restrict_realtime();
1704 }
1705
1706 static int apply_restrict_suid_sgid(const ExecContext *c, const ExecParameters *p) {
1707 assert(c);
1708 assert(p);
1709
1710 if (!c->restrict_suid_sgid)
1711 return 0;
1712
1713 if (skip_seccomp_unavailable("RestrictSUIDSGID="))
1714 return 0;
1715
1716 return seccomp_restrict_suid_sgid();
1717 }
1718
1719 static int apply_protect_sysctl(const ExecContext *c, const ExecParameters *p) {
1720 assert(c);
1721 assert(p);
1722
1723 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1724 * let's protect even those systems where this is left on in the kernel. */
1725
1726 if (!c->protect_kernel_tunables)
1727 return 0;
1728
1729 if (skip_seccomp_unavailable("ProtectKernelTunables="))
1730 return 0;
1731
1732 return seccomp_protect_sysctl();
1733 }
1734
1735 static int apply_protect_kernel_modules(const ExecContext *c, const ExecParameters *p) {
1736 assert(c);
1737 assert(p);
1738
1739 /* Turn off module syscalls on ProtectKernelModules=yes */
1740
1741 if (!c->protect_kernel_modules)
1742 return 0;
1743
1744 if (skip_seccomp_unavailable("ProtectKernelModules="))
1745 return 0;
1746
1747 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1748 }
1749
1750 static int apply_protect_kernel_logs(const ExecContext *c, const ExecParameters *p) {
1751 assert(c);
1752 assert(p);
1753
1754 if (!c->protect_kernel_logs)
1755 return 0;
1756
1757 if (skip_seccomp_unavailable("ProtectKernelLogs="))
1758 return 0;
1759
1760 return seccomp_protect_syslog();
1761 }
1762
1763 static int apply_protect_clock(const ExecContext *c, const ExecParameters *p) {
1764 assert(c);
1765 assert(p);
1766
1767 if (!c->protect_clock)
1768 return 0;
1769
1770 if (skip_seccomp_unavailable("ProtectClock="))
1771 return 0;
1772
1773 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1774 }
1775
1776 static int apply_private_devices(const ExecContext *c, const ExecParameters *p) {
1777 assert(c);
1778 assert(p);
1779
1780 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1781
1782 if (!c->private_devices)
1783 return 0;
1784
1785 if (skip_seccomp_unavailable("PrivateDevices="))
1786 return 0;
1787
1788 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1789 }
1790
1791 static int apply_restrict_namespaces(const ExecContext *c, const ExecParameters *p) {
1792 assert(c);
1793 assert(p);
1794
1795 if (!exec_context_restrict_namespaces_set(c))
1796 return 0;
1797
1798 if (skip_seccomp_unavailable("RestrictNamespaces="))
1799 return 0;
1800
1801 return seccomp_restrict_namespaces(c->restrict_namespaces);
1802 }
1803
1804 static int apply_lock_personality(const ExecContext *c, const ExecParameters *p) {
1805 unsigned long personality;
1806 int r;
1807
1808 assert(c);
1809 assert(p);
1810
1811 if (!c->lock_personality)
1812 return 0;
1813
1814 if (skip_seccomp_unavailable("LockPersonality="))
1815 return 0;
1816
1817 personality = c->personality;
1818
1819 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1820 if (personality == PERSONALITY_INVALID) {
1821
1822 r = opinionated_personality(&personality);
1823 if (r < 0)
1824 return r;
1825 }
1826
1827 return seccomp_lock_personality(personality);
1828 }
1829
1830 #endif
1831
1832 #if HAVE_LIBBPF
1833 static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) {
1834 int r;
1835
1836 assert(c);
1837 assert(p);
1838
1839 if (!exec_context_restrict_filesystems_set(c))
1840 return 0;
1841
1842 if (p->bpf_restrict_fs_map_fd < 0) {
1843 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1844 log_debug("LSM BPF not supported, skipping RestrictFileSystems=");
1845 return 0;
1846 }
1847
1848 /* We are in a new binary, so dl-open again */
1849 r = dlopen_bpf();
1850 if (r < 0)
1851 return r;
1852
1853 return bpf_restrict_fs_update(c->restrict_filesystems, p->cgroup_id, p->bpf_restrict_fs_map_fd, c->restrict_filesystems_allow_list);
1854 }
1855 #endif
1856
1857 static int apply_protect_hostname(const ExecContext *c, const ExecParameters *p, int *ret_exit_status) {
1858 int r;
1859
1860 assert(c);
1861 assert(p);
1862 assert(ret_exit_status);
1863
1864 if (c->protect_hostname == PROTECT_HOSTNAME_NO)
1865 return 0;
1866
1867 if (namespace_type_supported(NAMESPACE_UTS)) {
1868 if (unshare(CLONE_NEWUTS) < 0) {
1869 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1870 *ret_exit_status = EXIT_NAMESPACE;
1871 return log_error_errno(errno, "Failed to set up UTS namespacing: %m");
1872 }
1873
1874 log_warning("ProtectHostname=%s is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.",
1875 protect_hostname_to_string(c->protect_hostname));
1876
1877 } else if (c->private_hostname) {
1878 r = sethostname_idempotent(c->private_hostname);
1879 if (r < 0) {
1880 *ret_exit_status = EXIT_NAMESPACE;
1881 return log_error_errno(r, "Failed to set private hostname '%s': %m", c->private_hostname);
1882 }
1883 }
1884 } else
1885 log_warning("ProtectHostname=%s is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.",
1886 protect_hostname_to_string(c->protect_hostname));
1887
1888 #if HAVE_SECCOMP
1889 if (c->protect_hostname == PROTECT_HOSTNAME_YES) {
1890 if (skip_seccomp_unavailable("ProtectHostname="))
1891 return 0;
1892
1893 r = seccomp_protect_hostname();
1894 if (r < 0) {
1895 *ret_exit_status = EXIT_SECCOMP;
1896 return log_error_errno(r, "Failed to apply hostname restrictions: %m");
1897 }
1898 }
1899 #endif
1900
1901 return 1;
1902 }
1903
1904 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1905 assert(idle_pipe);
1906
1907 idle_pipe[1] = safe_close(idle_pipe[1]);
1908 idle_pipe[2] = safe_close(idle_pipe[2]);
1909
1910 if (idle_pipe[0] >= 0) {
1911 int r;
1912
1913 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1914
1915 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1916 ssize_t n;
1917
1918 /* Signal systemd that we are bored and want to continue. */
1919 n = write(idle_pipe[3], "x", 1);
1920 if (n > 0)
1921 /* Wait for systemd to react to the signal above. */
1922 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1923 }
1924
1925 idle_pipe[0] = safe_close(idle_pipe[0]);
1926
1927 }
1928
1929 idle_pipe[3] = safe_close(idle_pipe[3]);
1930 }
1931
/* Forward declaration of the lookup function for the table below; its definition is provided via the
 * DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING() invocation that follows the table. */
static const char *exec_directory_env_name_to_string(ExecDirectoryType t);

/* Maps each ExecDirectoryType to the name of the environment variable through which the selected
 * directory is passed to the service payload. */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "RUNTIME_DIRECTORY",
        [EXEC_DIRECTORY_STATE]         = "STATE_DIRECTORY",
        [EXEC_DIRECTORY_CACHE]         = "CACHE_DIRECTORY",
        [EXEC_DIRECTORY_LOGS]          = "LOGS_DIRECTORY",
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
1945
/* Assembles the environment variables that are derived from the execution context and parameters
 * (socket activation, watchdog, user identity, invocation ID, journal stream, directories, credentials,
 * memory pressure, notification socket, $TMPDIR).
 *
 * On success a newly allocated, NULL-terminated string vector is stored in *ret and 0 is returned. On
 * failure a negative errno-style value is returned (-ENOMEM for allocation failures, or the error from
 * get_fixed_user() or exec_context_get_credential_directory()) and *ret is left untouched. */
static int build_environment(
                const ExecContext *c,
                const ExecParameters *p,
                const CGroupContext *cgroup_context,
                size_t n_fds,
                const char *home,
                const char *username,
                const char *shell,
                dev_t journal_stream_dev,
                ino_t journal_stream_ino,
                const char *memory_pressure_path,
                bool needs_sandboxing,
                char ***ret) {

        _cleanup_strv_free_ char **our_env = NULL;
        size_t n_env = 0;
        char *x;
        int r;

        assert(c);
        assert(p);
        assert(cgroup_context);
        assert(ret);

        /* Upper bound for the number of variables added below outside of the per-directory-type loop. The
         * array is sized from it (plus one slot per directory type and the terminating NULL) and never
         * grown, hence this must be kept in sync when variables are added; the assert() at the end
         * checks this. */
#define N_ENV_VARS 19
        our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX + 1);
        if (!our_env)
                return -ENOMEM;

        /* Socket activation: $LISTEN_PID, $LISTEN_FDS, $LISTEN_FDNAMES */
        if (n_fds > 0) {
                _cleanup_free_ char *joined = NULL;

                if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                joined = strv_join(p->fd_names, ":");
                if (!joined)
                        return -ENOMEM;

                x = strjoin("LISTEN_FDNAMES=", joined);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* Watchdog: $WATCHDOG_PID, $WATCHDOG_USEC */
        if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
                if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
         * Varlink calls back to us to look up dynamic users in PID 1. Break the deadlock between D-Bus and
         * PID 1 by disabling use of PID 1's NSS interface for looking up dynamic users. */
        if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
                x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
         * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
         * really make much sense since we're not logged in. Hence we conditionalize the three based on
         * SetLoginEnvironment= switch. */
        if (!username && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
                assert(!c->user);

                r = get_fixed_user("root", /* prefer_nss = */ false, &username, NULL, NULL, &home, &shell);
                if (r < 0)
                        return log_debug_errno(r, "Failed to determine user credentials for root: %m");
        }

        bool set_user_login_env = exec_context_get_set_login_environment(c);

        if (username) {
                x = strjoin("USER=", username);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (set_user_login_env) {
                        x = strjoin("LOGNAME=", username);
                        if (!x)
                                return -ENOMEM;
                        our_env[n_env++] = x;
                }
        }

        /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
         * (i.e. are "/" or "/bin/nologin"). */

        if (home && set_user_login_env && !empty_or_root(home)) {
                x = strjoin("HOME=", home);
                if (!x)
                        return -ENOMEM;

                /* Simplify only the path part, i.e. skip over the "HOME=" prefix. */
                path_simplify(x + 5);
                our_env[n_env++] = x;
        }

        if (shell && set_user_login_env && !shell_is_placeholder(shell)) {
                x = strjoin("SHELL=", shell);
                if (!x)
                        return -ENOMEM;

                /* Likewise, skip over the "SHELL=" prefix. */
                path_simplify(x + 6);
                our_env[n_env++] = x;
        }

        if (!sd_id128_is_null(p->invocation_id)) {
                assert(p->invocation_id_string);

                x = strjoin("INVOCATION_ID=", p->invocation_id_string);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* $JOURNAL_STREAM carries the device and inode number passed in by the caller, if both are set. */
        if (journal_stream_dev != 0 && journal_stream_ino != 0) {
                if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (c->log_namespace) {
                x = strjoin("LOG_NAMESPACE=", c->log_namespace);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* One variable per directory type that has a prefix and at least one configured directory. Its
         * value is the colon-separated list of the prefixed directory paths. */
        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                _cleanup_free_ char *joined = NULL;
                const char *n;

                if (!p->prefix[t])
                        continue;

                if (c->directories[t].n_items == 0)
                        continue;

                n = exec_directory_env_name_to_string(t);
                if (!n)
                        continue;

                for (size_t i = 0; i < c->directories[t].n_items; i++) {
                        _cleanup_free_ char *prefixed = NULL;

                        prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
                        if (!prefixed)
                                return -ENOMEM;

                        if (!strextend_with_separator(&joined, ":", prefixed))
                                return -ENOMEM;
                }

                x = strjoin(n, "=", joined);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* $CREDENTIALS_DIRECTORY is only set if a credential directory is reported (r > 0). */
        _cleanup_free_ char *creds_dir = NULL;
        r = exec_context_get_credential_directory(c, p, p->unit_id, &creds_dir);
        if (r < 0)
                return r;
        if (r > 0) {
                x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* $SYSTEMD_EXEC_PID is set unconditionally. */
        if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
                return -ENOMEM;

        our_env[n_env++] = x;

        if (memory_pressure_path) {
                x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;

                /* $MEMORY_PRESSURE_WRITE is only set if the path is something other than /dev/null. Its
                 * value is the Base64 encoding of "<type> <threshold> <window>", including the trailing
                 * NUL byte. */
                if (!path_equal(memory_pressure_path, "/dev/null")) {
                        _cleanup_free_ char *b = NULL, *e = NULL;

                        if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
                                     MEMORY_PRESSURE_DEFAULT_TYPE,
                                     cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
                                     CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
                                     MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
                                return -ENOMEM;

                        if (base64mem(b, strlen(b) + 1, &e) < 0)
                                return -ENOMEM;

                        x = strjoin("MEMORY_PRESSURE_WRITE=", e);
                        if (!x)
                                return -ENOMEM;

                        our_env[n_env++] = x;
                }
        }

        /* Prefer the private notify socket path if there is one, otherwise use the regular one. */
        if (p->notify_socket) {
                x = strjoin("NOTIFY_SOCKET=", exec_get_private_notify_socket_path(c, p, needs_sandboxing) ?: p->notify_socket);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        assert(c->private_var_tmp >= 0 && c->private_var_tmp < _PRIVATE_TMP_MAX);
        if (needs_sandboxing && c->private_tmp != c->private_var_tmp) {
                assert(c->private_tmp == PRIVATE_TMP_DISCONNECTED);
                assert(c->private_var_tmp == PRIVATE_TMP_NO);

                /* When private tmpfs is enabled only on /tmp/, then explicitly set $TMPDIR to suggest the
                 * service to use /tmp/. */

                x = strdup("TMPDIR=/tmp");
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* Make sure we did not write beyond the slots allocated at the top. */
        assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
#undef N_ENV_VARS

        *ret = TAKE_PTR(our_env);

        return 0;
}
2197
2198 static int build_pass_environment(const ExecContext *c, char ***ret) {
2199 _cleanup_strv_free_ char **pass_env = NULL;
2200 size_t n_env = 0;
2201
2202 assert(c);
2203 assert(ret);
2204
2205 STRV_FOREACH(i, c->pass_environment) {
2206 _cleanup_free_ char *x = NULL;
2207 char *v;
2208
2209 v = getenv(*i);
2210 if (!v)
2211 continue;
2212 x = strjoin(*i, "=", v);
2213 if (!x)
2214 return -ENOMEM;
2215
2216 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2217 return -ENOMEM;
2218
2219 pass_env[n_env++] = TAKE_PTR(x);
2220 pass_env[n_env] = NULL;
2221 }
2222
2223 *ret = TAKE_PTR(pass_env);
2224 return 0;
2225 }
2226
2227 static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) {
2228 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2229 _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
2230 _cleanup_close_ int unshare_ready_fd = -EBADF;
2231 _cleanup_(sigkill_waitp) pid_t pid = 0;
2232 uint64_t c = 1;
2233 ssize_t n;
2234 int r;
2235
2236 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2237 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2238 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2239 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2240 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2241 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2242 * continues execution normally.
2243 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2244 * does not need CAP_SETUID to write the single line mapping to itself. */
2245
2246 if (private_users == PRIVATE_USERS_NO)
2247 return 0;
2248
2249 if (private_users == PRIVATE_USERS_IDENTITY) {
2250 uid_map = strdup("0 0 65536\n");
2251 if (!uid_map)
2252 return -ENOMEM;
2253 } else if (private_users == PRIVATE_USERS_FULL) {
2254 /* Map all UID/GID from original to new user namespace. We can't use `0 0 UINT32_MAX` because
2255 * this is the same UID/GID map as the init user namespace and systemd's running_in_userns()
2256 * checks whether its in a user namespace by comparing uid_map/gid_map to `0 0 UINT32_MAX`.
2257 * Thus, we still map all UIDs/GIDs but do it using two extents to differentiate the new user
2258 * namespace from the init namespace:
2259 * 0 0 1
2260 * 1 1 UINT32_MAX - 1
2261 *
2262 * systemd will remove the heuristic in running_in_userns() and use namespace inodes in version 258
2263 * (PR #35382). But some users may be running a container image with older systemd < 258 so we keep
2264 * this uid_map/gid_map hack until version 259 for version N-1 compatibility.
2265 *
2266 * TODO: Switch to `0 0 UINT32_MAX` in systemd v259.
2267 *
2268 * Note the kernel defines the UID range between 0 and UINT32_MAX so we map all UIDs even though
2269 * the UID range beyond INT32_MAX (i.e. the range above the signed 32-bit range) is
2270 * icky. For example, setfsuid() returns the old UID as signed integer. But units can decide to
2271 * use these UIDs/GIDs so we need to map them. */
2272 r = asprintf(&uid_map, "0 0 1\n"
2273 "1 1 " UID_FMT "\n", (uid_t) (UINT32_MAX - 1));
2274 if (r < 0)
2275 return -ENOMEM;
2276 /* Can only set up multiple mappings with CAP_SETUID. */
2277 } else if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid)) {
2278 r = asprintf(&uid_map,
2279 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2280 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2281 ouid, ouid, uid, uid);
2282 if (r < 0)
2283 return -ENOMEM;
2284 } else {
2285 r = asprintf(&uid_map,
2286 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2287 ouid, ouid);
2288 if (r < 0)
2289 return -ENOMEM;
2290 }
2291
2292 if (private_users == PRIVATE_USERS_IDENTITY) {
2293 gid_map = strdup("0 0 65536\n");
2294 if (!gid_map)
2295 return -ENOMEM;
2296 } else if (private_users == PRIVATE_USERS_FULL) {
2297 r = asprintf(&gid_map, "0 0 1\n"
2298 "1 1 " GID_FMT "\n", (gid_t) (UINT32_MAX - 1));
2299 if (r < 0)
2300 return -ENOMEM;
2301 /* Can only set up multiple mappings with CAP_SETGID. */
2302 } else if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid)) {
2303 r = asprintf(&gid_map,
2304 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2305 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2306 ogid, ogid, gid, gid);
2307 if (r < 0)
2308 return -ENOMEM;
2309 } else {
2310 r = asprintf(&gid_map,
2311 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2312 ogid, ogid);
2313 if (r < 0)
2314 return -ENOMEM;
2315 }
2316
2317 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2318 * namespace. */
2319 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2320 if (unshare_ready_fd < 0)
2321 return -errno;
2322
2323 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2324 * failed. */
2325 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2326 return -errno;
2327
2328 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, &pid);
2329 if (r < 0)
2330 return r;
2331 if (r == 0) {
2332 _cleanup_close_ int fd = -EBADF;
2333 const char *a;
2334 pid_t ppid;
2335
2336 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2337 * here, after the parent opened its own user namespace. */
2338
2339 ppid = getppid();
2340 errno_pipe[0] = safe_close(errno_pipe[0]);
2341
2342 /* Wait until the parent unshared the user namespace */
2343 if (read(unshare_ready_fd, &c, sizeof(c)) < 0)
2344 report_errno_and_exit(errno_pipe[1], -errno);
2345
2346 /* Disable the setgroups() system call in the child user namespace, for good, unless PrivateUsers=full
2347 * and using the system service manager. */
2348 a = procfs_file_alloca(ppid, "setgroups");
2349 fd = open(a, O_WRONLY|O_CLOEXEC);
2350 if (fd < 0) {
2351 if (errno != ENOENT) {
2352 r = log_debug_errno(errno, "Failed to open %s: %m", a);
2353 report_errno_and_exit(errno_pipe[1], r);
2354 }
2355
2356 /* If the file is missing the kernel is too old, let's continue anyway. */
2357 } else {
2358 const char *setgroups = allow_setgroups ? "allow\n" : "deny\n";
2359 if (write(fd, setgroups, strlen(setgroups)) < 0) {
2360 r = log_debug_errno(errno, "Failed to write '%s' to %s: %m", setgroups, a);
2361 report_errno_and_exit(errno_pipe[1], r);
2362 }
2363
2364 fd = safe_close(fd);
2365 }
2366
2367 /* First write the GID map */
2368 a = procfs_file_alloca(ppid, "gid_map");
2369 fd = open(a, O_WRONLY|O_CLOEXEC);
2370 if (fd < 0) {
2371 r = log_debug_errno(errno, "Failed to open %s: %m", a);
2372 report_errno_and_exit(errno_pipe[1], r);
2373 }
2374
2375 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2376 r = log_debug_errno(errno, "Failed to write GID map to %s: %m", a);
2377 report_errno_and_exit(errno_pipe[1], r);
2378 }
2379
2380 fd = safe_close(fd);
2381
2382 /* Then write the UID map */
2383 a = procfs_file_alloca(ppid, "uid_map");
2384 fd = open(a, O_WRONLY|O_CLOEXEC);
2385 if (fd < 0) {
2386 r = log_debug_errno(errno, "Failed to open %s: %m", a);
2387 report_errno_and_exit(errno_pipe[1], r);
2388 }
2389
2390 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2391 r = log_debug_errno(errno, "Failed to write UID map to %s: %m", a);
2392 report_errno_and_exit(errno_pipe[1], r);
2393 }
2394
2395 _exit(EXIT_SUCCESS);
2396 }
2397
2398 errno_pipe[1] = safe_close(errno_pipe[1]);
2399
2400 if (unshare(CLONE_NEWUSER) < 0)
2401 return log_debug_errno(errno, "Failed to unshare user namespace: %m");
2402
2403 /* Let the child know that the namespace is ready now */
2404 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2405 return -errno;
2406
2407 /* Try to read an error code from the child */
2408 n = read(errno_pipe[0], &r, sizeof(r));
2409 if (n < 0)
2410 return -errno;
2411 if (n == sizeof(r)) { /* an error code was sent to us */
2412 if (r < 0)
2413 return r;
2414 return -EIO;
2415 }
2416 if (n != 0) /* on success we should have read 0 bytes */
2417 return -EIO;
2418
2419 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2420 if (r < 0)
2421 return r;
2422 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2423 return -EIO;
2424
2425 return 1;
2426 }
2427
2428 static int can_mount_proc(void) {
2429 _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
2430 _cleanup_(sigkill_waitp) pid_t pid = 0;
2431 ssize_t n;
2432 int r;
2433
2434 /* If running via unprivileged user manager and /proc/ is masked (e.g. /proc/kmsg is over-mounted with tmpfs
2435 * like systemd-nspawn does), then mounting /proc/ will fail with EPERM. This is due to a kernel restriction
2436 * where unprivileged user namespaces cannot mount a less restrictive instance of /proc. */
2437
2438 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2439 * failed. */
2440 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2441 return log_debug_errno(errno, "Failed to create pipe for communicating with child process (sd-proc-check): %m");
2442
2443 /* Fork a child process into its own mount and PID namespace. Note safe_fork() already remounts / as SLAVE
2444 * with FORK_MOUNTNS_SLAVE. */
2445 r = safe_fork("(sd-proc-check)",
2446 FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_NEW_PIDNS, &pid);
2447 if (r < 0)
2448 return log_debug_errno(r, "Failed to fork child process (sd-proc-check): %m");
2449 if (r == 0) {
2450 errno_pipe[0] = safe_close(errno_pipe[0]);
2451
2452 /* Try mounting /proc on /dev/shm/. No need to clean up the mount since the mount
2453 * namespace will be cleaned up once the process exits. */
2454 r = mount_follow_verbose(LOG_DEBUG, "proc", "/dev/shm/", "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2455 if (r < 0) {
2456 (void) write(errno_pipe[1], &r, sizeof(r));
2457 _exit(EXIT_FAILURE);
2458 }
2459
2460 _exit(EXIT_SUCCESS);
2461 }
2462
2463 errno_pipe[1] = safe_close(errno_pipe[1]);
2464
2465 /* Try to read an error code from the child */
2466 n = read(errno_pipe[0], &r, sizeof(r));
2467 if (n < 0)
2468 return log_debug_errno(errno, "Failed to read errno from pipe with child process (sd-proc-check): %m");
2469 if (n == sizeof(r)) { /* an error code was sent to us */
2470 /* This is the expected case where proc cannot be mounted due to permissions. */
2471 if (ERRNO_IS_NEG_PRIVILEGE(r))
2472 return 0;
2473 if (r < 0)
2474 return r;
2475
2476 return -EIO;
2477 }
2478 if (n != 0) /* on success we should have read 0 bytes */
2479 return -EIO;
2480
2481 r = wait_for_terminate_and_check("(sd-proc-check)", TAKE_PID(pid), 0 /* flags= */);
2482 if (r < 0)
2483 return log_debug_errno(r, "Failed to wait for (sd-proc-check) child process to terminate: %m");
2484 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2485 return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Child process (sd-proc-check) exited with unexpected exit status '%d'.", r);
2486
2487 return 1;
2488 }
2489
2490 static int setup_private_pids(const ExecContext *c, ExecParameters *p) {
2491 _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
2492 _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
2493 ssize_t n;
2494 int r, q;
2495
2496 assert(c);
2497 assert(p);
2498 assert(p->pidref_transport_fd >= 0);
2499
2500 /* The first process created after unsharing a pid namespace becomes PID 1 in the pid namespace, so
2501 * we have to fork after unsharing the pid namespace to become PID 1. The parent sends the child
2502 * pidref to the manager and exits while the child process continues with the rest of exec_invoke()
2503 * and finally executes the actual payload. */
2504
2505 /* Create a communication channel so that the parent can tell the child a proper error code in case it
2506 * failed to send child pidref to the manager. */
2507 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2508 return log_debug_errno(errno, "Failed to create pipe for communicating with parent process: %m");
2509
2510 /* Set FORK_DETACH to immediately re-parent the child process to the invoking manager process. */
2511 r = pidref_safe_fork("(sd-pidns-child)", FORK_NEW_PIDNS|FORK_DETACH, &pidref);
2512 if (r < 0)
2513 return log_debug_errno(r, "Failed to fork child into new pid namespace: %m");
2514 if (r > 0) {
2515 errno_pipe[0] = safe_close(errno_pipe[0]);
2516
2517 /* In the parent process, we send the child pidref to the manager and exit.
2518 * If PIDFD is not supported, only the child PID is sent. The server then
2519 * uses the child PID to set the new exec main process. */
2520 q = send_one_fd_iov(
2521 p->pidref_transport_fd,
2522 pidref.fd,
2523 &IOVEC_MAKE(&pidref.pid, sizeof(pidref.pid)),
2524 /*iovlen=*/ 1,
2525 /*flags=*/ 0);
2526 /* Send error code to child process. */
2527 (void) write(errno_pipe[1], &q, sizeof(q));
2528 /* Exit here so we only go through the destructors in exec_invoke only once - in the child - as
2529 * some destructors have external effects. The main codepaths continue in the child process. */
2530 _exit(q < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
2531 }
2532
2533 errno_pipe[1] = safe_close(errno_pipe[1]);
2534 p->pidref_transport_fd = safe_close(p->pidref_transport_fd);
2535
2536 /* Try to read an error code from the parent. Note a child process cannot wait for the parent so we always
2537 * receive an errno even on success. */
2538 n = read(errno_pipe[0], &r, sizeof(r));
2539 if (n < 0)
2540 return log_debug_errno(errno, "Failed to read errno from pipe with parent process: %m");
2541 if (n != sizeof(r))
2542 return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Failed to read enough bytes from pipe with parent process");
2543 if (r < 0)
2544 return log_debug_errno(r, "Failed to send child pidref to manager: %m");
2545
2546 /* NOTE! This function returns in the child process only. */
2547 return r;
2548 }
2549
2550 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2551 _cleanup_free_ char *src_abs = NULL;
2552 int r;
2553
2554 assert(source);
2555
2556 src_abs = path_join(root, source);
2557 if (!src_abs)
2558 return -ENOMEM;
2559
2560 STRV_FOREACH(dst, symlinks) {
2561 _cleanup_free_ char *dst_abs = NULL;
2562
2563 dst_abs = path_join(root, *dst);
2564 if (!dst_abs)
2565 return -ENOMEM;
2566
2567 r = mkdir_parents_label(dst_abs, 0755);
2568 if (r < 0)
2569 return r;
2570
2571 r = symlink_idempotent(src_abs, dst_abs, true);
2572 if (r < 0)
2573 return r;
2574 }
2575
2576 return 0;
2577 }
2578
/* Creates and prepares the directories configured for one ExecDirectoryType (context->directories[type])
 * below params->prefix[type].
 *
 * Returns 0 right away when no prefix is configured for the type, and 0 on success. On failure a negative
 * errno-style value is returned and the matching entry of exit_status_table is stored in *exit_status.
 *
 * uid/gid are the IDs the directory trees get chown()ed to (only done when running in system scope). If
 * needs_mount_namespace is false the per-item symlinks are created here too, via create_many_symlinks(). */
2579 static int setup_exec_directory(
2580 const ExecContext *context,
2581 const ExecParameters *params,
2582 uid_t uid,
2583 gid_t gid,
2584 ExecDirectoryType type,
2585 bool needs_mount_namespace,
2586 int *exit_status) {
2587
/* Exit status reported through *exit_status on failure, indexed by directory type. */
2588 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2589 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2590 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2591 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2592 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2593 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2594 };
2595 int r;
2596
2597 assert(context);
2598 assert(params);
2599 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2600 assert(exit_status);
2601
2602 if (!params->prefix[type])
2603 return 0;
2604
/* With EXEC_CHOWN_DIRECTORIES set, an invalid UID/GID falls back to 0. */
2605 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2606 if (!uid_is_valid(uid))
2607 uid = 0;
2608 if (!gid_is_valid(gid))
2609 gid = 0;
2610 }
2611
2612 FOREACH_ARRAY(i, context->directories[type].items, context->directories[type].n_items) {
/* p is the public path below the prefix; pp stays NULL unless the private/ layout is used. */
2613 _cleanup_free_ char *p = NULL, *pp = NULL;
2614
2615 p = path_join(params->prefix[type], i->path);
2616 if (!p) {
2617 r = -ENOMEM;
2618 goto fail;
2619 }
2620
2621 r = mkdir_parents_label(p, 0755);
2622 if (r < 0)
2623 goto fail;
2624
2625 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2626
2627 /* If we are in user mode, and a configuration directory exists but a state directory
2628 * doesn't exist, then we likely are upgrading from an older systemd version that
2629 * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2630 * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2631 * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
2632 * separated. If a service has both dirs configured but only the configuration dir
2633 * exists and the state dir does not, we assume we are looking at an update
2634 * situation. Hence, create a compatibility symlink, so that all expectations are
2635 * met.
2636 *
2637 * (We also do something similar with the log directory, which still doesn't exist in
2638 * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2639
2640 /* this assumes the state dir is always created before the configuration dir */
2641 assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2642 assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2643
2644 r = access_nofollow(p, F_OK);
2645 if (r == -ENOENT) {
2646 _cleanup_free_ char *q = NULL;
2647
2648 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2649 * under the configuration hierarchy. */
2650
2651 if (type == EXEC_DIRECTORY_STATE)
2652 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], i->path);
2653 else if (type == EXEC_DIRECTORY_LOGS)
2654 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", i->path);
2655 else
2656 assert_not_reached();
2657 if (!q) {
2658 r = -ENOMEM;
2659 goto fail;
2660 }
2661
2662 r = access_nofollow(q, F_OK);
2663 if (r >= 0) {
2664 /* It does exist! This hence looks like an update. Symlink the
2665 * configuration directory into the state directory. */
2666
2667 r = symlink_idempotent(q, p, /* make_relative= */ true);
2668 if (r < 0)
2669 goto fail;
2670
2671 log_notice("Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
/* p is a symlink to q now, so the creation/chmod/chown steps below are skipped for this item. */
2672 continue;
2673 } else if (r != -ENOENT)
2674 log_warning_errno(r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2675
2676 } else if (r < 0)
2677 log_warning_errno(r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2678 }
2679
2680 if (exec_directory_is_private(context, type)) {
2681 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2682 * case we want to avoid leaving a directory around fully accessible that is owned by
2683 * a dynamic user whose UID is later on reused. To lock this down we use the same
2684 * trick used by container managers to prohibit host users to get access to files of
2685 * the same UID in containers: we place everything inside a directory that has an
2686 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2687 * for unprivileged host code. We then use fs namespacing to make this directory
2688 * permeable for the service itself.
2689 *
2690 * Specifically: for a service which wants a special directory "foo/" we first create
2691 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2692 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2693 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2694 * unprivileged host users can't look into it. Inside of the namespace of the unit
2695 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2696 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2697 * for the service and making sure it only gets access to the dirs it needs but no
2698 * others. Tricky? Yes, absolutely, but it works!
2699 *
2700 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2701 * to be owned by the service itself.
2702 *
2703 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2704 * for sharing files or sockets with other services. */
2705
2706 pp = path_join(params->prefix[type], "private");
2707 if (!pp) {
2708 r = -ENOMEM;
2709 goto fail;
2710 }
2711
2712 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2713 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2714 if (r < 0)
2715 goto fail;
2716
/* From here on pp names the item's directory inside the private root. */
2717 if (!path_extend(&pp, i->path)) {
2718 r = -ENOMEM;
2719 goto fail;
2720 }
2721
2722 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2723 r = mkdir_parents_label(pp, 0755);
2724 if (r < 0)
2725 goto fail;
2726
2727 if (is_dir(p, false) > 0 &&
2728 (access_nofollow(pp, F_OK) == -ENOENT)) {
2729
2730 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2731 * it over. Most likely the service has been upgraded from one that didn't use
2732 * DynamicUser=1, to one that does. */
2733
2734 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2735 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2736 exec_directory_type_to_string(type), p, pp);
2737
2738 r = RET_NERRNO(rename(p, pp));
2739 if (r < 0)
2740 goto fail;
2741 } else {
2742 /* Otherwise, create the actual directory for the service */
2743
2744 r = mkdir_label(pp, context->directories[type].mode);
2745 if (r < 0 && r != -EEXIST)
2746 goto fail;
2747 }
2748
2749 if (!FLAGS_SET(i->flags, EXEC_DIRECTORY_ONLY_CREATE)) {
2750 /* And link it up from the original place.
2751 * Notes
2752 * 1) If a mount namespace is going to be used, then this symlink remains on
2753 * the host, and a new one for the child namespace will be created later.
2754 * 2) It is not necessary to create this symlink when one of its parent
2755 * directories is specified and already created. E.g.
2756 * StateDirectory=foo foo/bar
2757 * In that case, the inode points to pp and p for "foo/bar" are the same:
2758 * pp = "/var/lib/private/foo/bar"
2759 * p = "/var/lib/foo/bar"
2760 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2761 * we do not need to create the symlink, but we cannot create the symlink.
2762 * See issue #24783. */
2763 r = symlink_idempotent(pp, p, true);
2764 if (r < 0)
2765 goto fail;
2766 }
2767
2768 } else {
2769 _cleanup_free_ char *target = NULL;
2770
2771 if (EXEC_DIRECTORY_TYPE_SHALL_CHOWN(type) &&
2772 readlink_and_make_absolute(p, &target) >= 0) {
2773 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2774
2775 /* This already exists and is a symlink? Interesting. Maybe it's one created
2776 * by DynamicUser=1 (see above)?
2777 *
2778 * We do this for all directory types except for ConfigurationDirectory=,
2779 * since they all support the private/ symlink logic at least in some
2780 * configurations, see above. */
2781
2782 r = chase(target, NULL, 0, &target_resolved, NULL);
2783 if (r < 0)
2784 goto fail;
2785
2786 q = path_join(params->prefix[type], "private", i->path);
2787 if (!q) {
2788 r = -ENOMEM;
2789 goto fail;
2790 }
2791
2792 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2793 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2794 if (r < 0)
2795 goto fail;
2796
2797 if (path_equal(q_resolved, target_resolved)) {
2798
2799 /* Hmm, apparently DynamicUser= was once turned on for this service,
2800 * but is no longer. Let's move the directory back up. */
2801
2802 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2803 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2804 exec_directory_type_to_string(type), q, p);
2805
2806 r = RET_NERRNO(unlink(p));
2807 if (r < 0)
2808 goto fail;
2809
2810 r = RET_NERRNO(rename(q, p));
2811 if (r < 0)
2812 goto fail;
2813 }
2814 }
2815
/* Create the directory itself; -EEXIST is tolerated and handled below. */
2816 r = mkdir_label(p, context->directories[type].mode);
2817 if (r < 0) {
2818 if (r != -EEXIST)
2819 goto fail;
2820
2821 if (!EXEC_DIRECTORY_TYPE_SHALL_CHOWN(type)) {
2822 struct stat st;
2823
2824 /* Don't change the owner/access mode of the configuration directory,
2825 * as in the common case it is not written to by a service, and shall
2826 * not be writable. */
2827
2828 r = RET_NERRNO(stat(p, &st));
2829 if (r < 0)
2830 goto fail;
2831
2832 /* Still complain if the access mode doesn't match */
2833 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2834 log_warning("%s \'%s\' already exists but the mode is different. "
2835 "(File system: %o %sMode: %o)",
2836 exec_directory_type_to_string(type), i->path,
2837 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2838
2839 continue;
2840 }
2841 }
2842 }
2843
2844 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2845 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2846 * current UID/GID ownership.) */
/* Operate on the private directory if one was set up above, otherwise on the public path. */
2847 const char *target_dir = pp ?: p;
2848 r = chmod_and_chown(target_dir, context->directories[type].mode, UID_INVALID, GID_INVALID);
2849 if (r < 0)
2850 goto fail;
2851
2852 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2853 * available to user code anyway */
2854 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2855 continue;
2856
2857 int idmapping_supported = is_idmapping_supported(target_dir);
2858 if (idmapping_supported < 0) {
2859 r = log_debug_errno(idmapping_supported, "Unable to determine if ID mapping is supported on mount '%s': %m", target_dir);
2860 goto fail;
2861 }
2862
2863 log_debug("ID-mapping is%ssupported for exec directory %s", idmapping_supported ? " " : " not ", target_dir);
2864
2865 /* Change the ownership of the whole tree, if necessary. When dynamic users are used we
2866 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2867 * assignments to exist. */
2868 uid_t chown_uid = uid;
2869 gid_t chown_gid = gid;
2870 bool do_chown = false;
2871
/* Decide whether to chown and whether the item is flagged as idmapped (stored in i->idmapped):
 * a zero UID or GID, or missing ID-mapping support, always means a plain chown to uid/gid. */
2872 if (uid == 0 || gid == 0 || !idmapping_supported) {
2873 do_chown = true;
2874 i->idmapped = false;
2875 } else {
2876 /* Use 'nobody' uid/gid for exec directories if ID-mapping is supported. For backward compatibility,
2877 * continue doing chmod/chown if the directory was chmod/chowned before (if uid/gid is not 'nobody') */
2878 struct stat st;
2879 r = RET_NERRNO(stat(target_dir, &st));
2880 if (r < 0)
2881 goto fail;
2882
2883 if (st.st_uid == UID_NOBODY && st.st_gid == GID_NOBODY) {
2884 do_chown = false;
2885 i->idmapped = true;
2886 } else if (exec_directory_is_private(context, type) && st.st_uid == 0 && st.st_gid == 0) {
2887 chown_uid = UID_NOBODY;
2888 chown_gid = GID_NOBODY;
2889 do_chown = true;
2890 i->idmapped = true;
2891 } else {
2892 do_chown = true;
2893 i->idmapped = false;
2894 }
2895 }
2896
2897 if (do_chown) {
2898 r = path_chown_recursive(target_dir, chown_uid, chown_gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2899 if (r < 0)
2900 goto fail;
2901 }
2902 }
2903
2904 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2905 * they are set up later, to allow configuring empty var/run/etc. */
2906 if (!needs_mount_namespace)
2907 FOREACH_ARRAY(i, context->directories[type].items, context->directories[type].n_items) {
2908 r = create_many_symlinks(params->prefix[type], i->path, i->symlinks);
2909 if (r < 0)
2910 goto fail;
2911 }
2912
2913 return 0;
2914
/* Common error exit: report the per-type exit status and propagate the error held in r. */
2915 fail:
2916 *exit_status = exit_status_table[type];
2917 return r;
2918 }
2919
#if ENABLE_SMACK
/* Applies a SMACK label to the current process (pid 0).
 *
 * context->smack_process_label wins if set. Otherwise, if a fallback label is configured, the
 * SMACK_ATTR_EXEC label read from executable_fd is used, or the fallback label if none was read. If
 * neither is configured nothing is done. Returns 0 on success, negative errno-style value on failure. */
static int setup_smack(
                const ExecContext *context,
                const ExecParameters *params,
                int executable_fd) {

        _cleanup_free_ char *exec_label = NULL;
        const char *label;
        int r;

        assert(context);
        assert(params);
        assert(executable_fd >= 0);

        if (context->smack_process_label)
                label = context->smack_process_label;
        else if (params->fallback_smack_process_label) {
                /* A missing xattr is fine, exec_label then stays unset and the fallback is used. */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                        return r;

                label = exec_label ?: params->fallback_smack_process_label;
        } else
                return 0;

        r = mac_smack_apply_pid(0, label);
        if (r < 0)
                return r;

        return 0;
}
#endif
2950
2951 static int compile_bind_mounts(
2952 const ExecContext *context,
2953 const ExecParameters *params,
2954 uid_t exec_directory_uid, /* only used for id-mapped mounts Exec directories */
2955 gid_t exec_directory_gid, /* only used for id-mapped mounts Exec directories */
2956 BindMount **ret_bind_mounts,
2957 size_t *ret_n_bind_mounts,
2958 char ***ret_empty_directories) {
2959
2960 _cleanup_strv_free_ char **empty_directories = NULL;
2961 BindMount *bind_mounts = NULL;
2962 size_t n, h = 0;
2963 int r;
2964
2965 assert(context);
2966 assert(params);
2967 assert(ret_bind_mounts);
2968 assert(ret_n_bind_mounts);
2969 assert(ret_empty_directories);
2970
2971 CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2972
2973 n = context->n_bind_mounts;
2974 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2975 if (!params->prefix[t])
2976 continue;
2977
2978 FOREACH_ARRAY(i, context->directories[t].items, context->directories[t].n_items)
2979 n += !FLAGS_SET(i->flags, EXEC_DIRECTORY_ONLY_CREATE) || FLAGS_SET(i->flags, EXEC_DIRECTORY_READ_ONLY);
2980 }
2981
2982 if (n <= 0) {
2983 *ret_bind_mounts = NULL;
2984 *ret_n_bind_mounts = 0;
2985 *ret_empty_directories = NULL;
2986 return 0;
2987 }
2988
2989 bind_mounts = new(BindMount, n);
2990 if (!bind_mounts)
2991 return -ENOMEM;
2992
2993 FOREACH_ARRAY(item, context->bind_mounts, context->n_bind_mounts) {
2994 r = bind_mount_add(&bind_mounts, &h, item);
2995 if (r < 0)
2996 return r;
2997 }
2998
2999 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3000 if (!params->prefix[t])
3001 continue;
3002
3003 if (context->directories[t].n_items == 0)
3004 continue;
3005
3006 if (exec_directory_is_private(context, t) &&
3007 !exec_context_with_rootfs(context)) {
3008 char *private_root;
3009
3010 /* So this is for a dynamic user, and we need to make sure the process can access its own
3011 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3012 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3013
3014 private_root = path_join(params->prefix[t], "private");
3015 if (!private_root)
3016 return -ENOMEM;
3017
3018 r = strv_consume(&empty_directories, private_root);
3019 if (r < 0)
3020 return r;
3021 }
3022
3023 FOREACH_ARRAY(i, context->directories[t].items, context->directories[t].n_items) {
3024 _cleanup_free_ char *s = NULL, *d = NULL;
3025
3026 /* When one of the parent directories is in the list, we cannot create the symlink
3027 * for the child directory. See also the comments in setup_exec_directory().
3028 * But if it needs to be read only, then we have to create a bind mount anyway to
3029 * make it so. */
3030 if (FLAGS_SET(i->flags, EXEC_DIRECTORY_ONLY_CREATE) && !FLAGS_SET(i->flags, EXEC_DIRECTORY_READ_ONLY))
3031 continue;
3032
3033 if (exec_directory_is_private(context, t))
3034 s = path_join(params->prefix[t], "private", i->path);
3035 else
3036 s = path_join(params->prefix[t], i->path);
3037 if (!s)
3038 return -ENOMEM;
3039
3040 if (exec_directory_is_private(context, t) &&
3041 exec_context_with_rootfs(context))
3042 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3043 * directory is not created on the root directory. So, let's bind-mount the directory
3044 * on the 'non-private' place. */
3045 d = path_join(params->prefix[t], i->path);
3046 else
3047 d = strdup(s);
3048 if (!d)
3049 return -ENOMEM;
3050
3051 bind_mounts[h++] = (BindMount) {
3052 .source = TAKE_PTR(s),
3053 .destination = TAKE_PTR(d),
3054 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3055 .recursive = true,
3056 .read_only = FLAGS_SET(i->flags, EXEC_DIRECTORY_READ_ONLY),
3057 .idmapped = i->idmapped,
3058 .uid = exec_directory_uid,
3059 .gid = exec_directory_gid,
3060 };
3061 }
3062 }
3063
3064 assert(h == n);
3065
3066 *ret_bind_mounts = TAKE_PTR(bind_mounts);
3067 *ret_n_bind_mounts = n;
3068 *ret_empty_directories = TAKE_PTR(empty_directories);
3069
3070 return (int) n;
3071 }
3072
3073 /* ret_symlinks will contain a list of pairs src:dest that describes
3074 * the symlinks to create later on. For example, the symlinks needed
3075 * to safely give private directories to DynamicUser=1 users. */
3076 static int compile_symlinks(
3077 const ExecContext *context,
3078 const ExecParameters *params,
3079 bool setup_os_release_symlink,
3080 char ***ret_symlinks) {
3081
3082 _cleanup_strv_free_ char **symlinks = NULL;
3083 int r;
3084
3085 assert(context);
3086 assert(params);
3087 assert(ret_symlinks);
3088
3089 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++)
3090 FOREACH_ARRAY(i, context->directories[dt].items, context->directories[dt].n_items) {
3091 _cleanup_free_ char *private_path = NULL, *path = NULL;
3092
3093 STRV_FOREACH(symlink, i->symlinks) {
3094 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3095
3096 src_abs = path_join(params->prefix[dt], i->path);
3097 dst_abs = path_join(params->prefix[dt], *symlink);
3098 if (!src_abs || !dst_abs)
3099 return -ENOMEM;
3100
3101 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3102 if (r < 0)
3103 return r;
3104 }
3105
3106 if (!exec_directory_is_private(context, dt) ||
3107 exec_context_with_rootfs(context) ||
3108 FLAGS_SET(i->flags, EXEC_DIRECTORY_ONLY_CREATE))
3109 continue;
3110
3111 private_path = path_join(params->prefix[dt], "private", i->path);
3112 if (!private_path)
3113 return -ENOMEM;
3114
3115 path = path_join(params->prefix[dt], i->path);
3116 if (!path)
3117 return -ENOMEM;
3118
3119 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3120 if (r < 0)
3121 return r;
3122 }
3123
3124 /* We make the host's os-release available via a symlink, so that we can copy it atomically
3125 * and readers will never get a half-written version. Note that, while the paths specified here are
3126 * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
3127 * 'os-release -> .os-release-stage/os-release' is what will be created. */
3128 if (setup_os_release_symlink) {
3129 r = strv_extend_many(
3130 &symlinks,
3131 "/run/host/.os-release-stage/os-release",
3132 "/run/host/os-release");
3133 if (r < 0)
3134 return r;
3135 }
3136
3137 *ret_symlinks = TAKE_PTR(symlinks);
3138
3139 return 0;
3140 }
3141
3142 static bool insist_on_sandboxing(
3143 const ExecContext *context,
3144 const char *root_dir,
3145 const char *root_image,
3146 const BindMount *bind_mounts,
3147 size_t n_bind_mounts) {
3148
3149 assert(context);
3150 assert(n_bind_mounts == 0 || bind_mounts);
3151
3152 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3153 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3154 * rearrange stuff in a way we cannot ignore gracefully. */
3155
3156 if (context->n_temporary_filesystems > 0)
3157 return true;
3158
3159 if (root_dir || root_image)
3160 return true;
3161
3162 if (context->n_mount_images > 0)
3163 return true;
3164
3165 if (context->dynamic_user)
3166 return true;
3167
3168 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3169 return true;
3170
3171 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3172 * essential. */
3173 FOREACH_ARRAY(i, bind_mounts, n_bind_mounts)
3174 if (!path_equal(i->source, i->destination))
3175 return true;
3176
3177 if (context->log_namespace)
3178 return true;
3179
3180 return false;
3181 }
3182
3183 static int setup_ephemeral(
3184 const ExecContext *context,
3185 ExecRuntime *runtime,
3186 char **root_image, /* both input and output! modified if ephemeral logic enabled */
3187 char **root_directory, /* ditto */
3188 char **reterr_path) {
3189
3190 _cleanup_close_ int fd = -EBADF;
3191 _cleanup_free_ char *new_root = NULL;
3192 int r;
3193
3194 assert(context);
3195 assert(runtime);
3196 assert(root_image);
3197 assert(root_directory);
3198
3199 if (!*root_image && !*root_directory)
3200 return 0;
3201
3202 if (!runtime->ephemeral_copy)
3203 return 0;
3204
3205 assert(runtime->ephemeral_storage_socket[0] >= 0);
3206 assert(runtime->ephemeral_storage_socket[1] >= 0);
3207
3208 new_root = strdup(runtime->ephemeral_copy);
3209 if (!new_root)
3210 return log_oom_debug();
3211
3212 r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
3213 if (r < 0)
3214 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
3215
3216 CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
3217
3218 fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3219 if (fd >= 0)
3220 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3221 return 0;
3222 if (fd != -EAGAIN)
3223 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3224
3225 if (*root_image) {
3226 log_debug("Making ephemeral copy of %s to %s", *root_image, new_root);
3227
3228 fd = copy_file(*root_image, new_root, O_EXCL, 0600,
3229 COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME|COPY_NOCOW_AFTER);
3230 if (fd < 0) {
3231 *reterr_path = strdup(*root_image);
3232 return log_debug_errno(fd, "Failed to copy image %s to %s: %m",
3233 *root_image, new_root);
3234 }
3235 } else {
3236 assert(*root_directory);
3237
3238 log_debug("Making ephemeral snapshot of %s to %s", *root_directory, new_root);
3239
3240 fd = btrfs_subvol_snapshot_at(
3241 AT_FDCWD, *root_directory,
3242 AT_FDCWD, new_root,
3243 BTRFS_SNAPSHOT_FALLBACK_COPY |
3244 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3245 BTRFS_SNAPSHOT_RECURSIVE |
3246 BTRFS_SNAPSHOT_LOCK_BSD);
3247 if (fd < 0) {
3248 *reterr_path = strdup(*root_directory);
3249 return log_debug_errno(fd, "Failed to snapshot directory %s to %s: %m",
3250 *root_directory, new_root);
3251 }
3252 }
3253
3254 r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
3255 if (r < 0)
3256 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
3257
3258 if (*root_image)
3259 free_and_replace(*root_image, new_root);
3260 else {
3261 assert(*root_directory);
3262 free_and_replace(*root_directory, new_root);
3263 }
3264
3265 return 1;
3266 }
3267
3268 static int verity_settings_prepare(
3269 VeritySettings *verity,
3270 const char *root_image,
3271 const void *root_hash,
3272 size_t root_hash_size,
3273 const char *root_hash_path,
3274 const void *root_hash_sig,
3275 size_t root_hash_sig_size,
3276 const char *root_hash_sig_path,
3277 const char *verity_data_path) {
3278
3279 int r;
3280
3281 assert(verity);
3282
3283 if (root_hash) {
3284 void *d;
3285
3286 d = memdup(root_hash, root_hash_size);
3287 if (!d)
3288 return -ENOMEM;
3289
3290 free_and_replace(verity->root_hash, d);
3291 verity->root_hash_size = root_hash_size;
3292 verity->designator = PARTITION_ROOT;
3293 }
3294
3295 if (root_hash_sig) {
3296 void *d;
3297
3298 d = memdup(root_hash_sig, root_hash_sig_size);
3299 if (!d)
3300 return -ENOMEM;
3301
3302 free_and_replace(verity->root_hash_sig, d);
3303 verity->root_hash_sig_size = root_hash_sig_size;
3304 verity->designator = PARTITION_ROOT;
3305 }
3306
3307 if (verity_data_path) {
3308 r = free_and_strdup(&verity->data_path, verity_data_path);
3309 if (r < 0)
3310 return r;
3311 }
3312
3313 r = verity_settings_load(
3314 verity,
3315 root_image,
3316 root_hash_path,
3317 root_hash_sig_path);
3318 if (r < 0)
3319 return log_debug_errno(r, "Failed to load root hash: %m");
3320
3321 return 0;
3322 }
3323
3324 static int pick_versions(
3325 const ExecContext *context,
3326 const ExecParameters *params,
3327 char **ret_root_image,
3328 char **ret_root_directory,
3329 char **reterr_path) {
3330
3331 int r;
3332
3333 assert(context);
3334 assert(params);
3335 assert(ret_root_image);
3336 assert(ret_root_directory);
3337
3338 if (context->root_image) {
3339 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
3340
3341 r = path_pick(/* toplevel_path= */ NULL,
3342 /* toplevel_fd= */ AT_FDCWD,
3343 context->root_image,
3344 &pick_filter_image_raw,
3345 PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3346 &result);
3347 if (r < 0) {
3348 *reterr_path = strdup(context->root_image);
3349 return r;
3350 }
3351
3352 if (!result.path) {
3353 *reterr_path = strdup(context->root_image);
3354 return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_image);
3355 }
3356
3357 *ret_root_image = TAKE_PTR(result.path);
3358 *ret_root_directory = NULL;
3359 return r;
3360 }
3361
3362 if (context->root_directory) {
3363 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
3364
3365 r = path_pick(/* toplevel_path= */ NULL,
3366 /* toplevel_fd= */ AT_FDCWD,
3367 context->root_directory,
3368 &pick_filter_image_dir,
3369 PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3370 &result);
3371 if (r < 0) {
3372 *reterr_path = strdup(context->root_directory);
3373 return r;
3374 }
3375
3376 if (!result.path) {
3377 *reterr_path = strdup(context->root_directory);
3378 return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_directory);
3379 }
3380
3381 *ret_root_image = NULL;
3382 *ret_root_directory = TAKE_PTR(result.path);
3383 return r;
3384 }
3385
3386 *ret_root_image = *ret_root_directory = NULL;
3387 return 0;
3388 }
3389
3390 static int apply_mount_namespace(
3391 ExecCommandFlags command_flags,
3392 const ExecContext *context,
3393 const ExecParameters *params,
3394 ExecRuntime *runtime,
3395 const char *memory_pressure_path,
3396 bool needs_sandboxing,
3397 char **reterr_path,
3398 uid_t exec_directory_uid,
3399 gid_t exec_directory_gid) {
3400
3401 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3402 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3403 **read_write_paths_cleanup = NULL;
3404 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3405 *private_namespace_dir = NULL, *host_os_release_stage = NULL, *root_image = NULL, *root_dir = NULL;
3406 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3407 char **read_write_paths;
3408 bool setup_os_release_symlink;
3409 BindMount *bind_mounts = NULL;
3410 size_t n_bind_mounts = 0;
3411 int r;
3412
3413 assert(context);
3414 assert(params);
3415 assert(runtime);
3416
3417 CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3418
3419 if (params->flags & EXEC_APPLY_CHROOT) {
3420 r = pick_versions(
3421 context,
3422 params,
3423 &root_image,
3424 &root_dir,
3425 reterr_path);
3426 if (r < 0)
3427 return r;
3428
3429 r = setup_ephemeral(
3430 context,
3431 runtime,
3432 &root_image,
3433 &root_dir,
3434 reterr_path);
3435 if (r < 0)
3436 return r;
3437 }
3438
3439 r = compile_bind_mounts(context, params, exec_directory_uid, exec_directory_gid, &bind_mounts, &n_bind_mounts, &empty_directories);
3440 if (r < 0)
3441 return r;
3442
3443 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3444 * service will need to write to it in order to start the notifications. */
3445 if (exec_is_cgroup_mount_read_only(context) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3446 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3447 if (!read_write_paths_cleanup)
3448 return -ENOMEM;
3449
3450 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3451 if (r < 0)
3452 return r;
3453
3454 read_write_paths = read_write_paths_cleanup;
3455 } else
3456 read_write_paths = context->read_write_paths;
3457
3458 if (needs_sandboxing) {
3459 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3460 * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3461 * use here. This does not apply when we are using /run/systemd/empty as fallback. */
3462
3463 if (context->private_tmp == PRIVATE_TMP_CONNECTED && runtime->shared) {
3464 if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3465 tmp_dir = runtime->shared->tmp_dir;
3466 else if (runtime->shared->tmp_dir)
3467 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
3468
3469 if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3470 var_tmp_dir = runtime->shared->var_tmp_dir;
3471 else if (runtime->shared->var_tmp_dir)
3472 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
3473 }
3474 }
3475
3476 /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3477 setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
3478 r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3479 if (r < 0)
3480 return r;
3481
3482 if (context->mount_propagation_flag == MS_SHARED)
3483 log_debug("shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3484
3485 r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path);
3486 if (r < 0)
3487 return r;
3488
3489 if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
3490 propagate_dir = path_join("/run/systemd/propagate/", params->unit_id);
3491 if (!propagate_dir)
3492 return -ENOMEM;
3493
3494 incoming_dir = strdup("/run/systemd/incoming");
3495 if (!incoming_dir)
3496 return -ENOMEM;
3497
3498 private_namespace_dir = strdup("/run/systemd");
3499 if (!private_namespace_dir)
3500 return -ENOMEM;
3501
3502 /* If running under a different root filesystem, propagate the host's os-release. We make a
3503 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3504 if (setup_os_release_symlink) {
3505 host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3506 if (!host_os_release_stage)
3507 return -ENOMEM;
3508 }
3509 } else {
3510 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3511
3512 if (asprintf(&private_namespace_dir, "/run/user/" UID_FMT "/systemd", geteuid()) < 0)
3513 return -ENOMEM;
3514
3515 if (setup_os_release_symlink) {
3516 if (asprintf(&host_os_release_stage,
3517 "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3518 geteuid()) < 0)
3519 return -ENOMEM;
3520 }
3521 }
3522
3523 if (root_image) {
3524 r = verity_settings_prepare(
3525 &verity,
3526 root_image,
3527 context->root_hash, context->root_hash_size, context->root_hash_path,
3528 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3529 context->root_verity);
3530 if (r < 0)
3531 return r;
3532 }
3533
3534 NamespaceParameters parameters = {
3535 .runtime_scope = params->runtime_scope,
3536
3537 .root_directory = root_dir,
3538 .root_image = root_image,
3539 .root_image_options = context->root_image_options,
3540 .root_image_policy = context->root_image_policy ?: &image_policy_service,
3541
3542 .read_write_paths = read_write_paths,
3543 .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
3544 .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
3545
3546 .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
3547 .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
3548
3549 .empty_directories = empty_directories,
3550 .symlinks = symlinks,
3551
3552 .bind_mounts = bind_mounts,
3553 .n_bind_mounts = n_bind_mounts,
3554
3555 .temporary_filesystems = context->temporary_filesystems,
3556 .n_temporary_filesystems = context->n_temporary_filesystems,
3557
3558 .mount_images = context->mount_images,
3559 .n_mount_images = context->n_mount_images,
3560 .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
3561
3562 .tmp_dir = tmp_dir,
3563 .var_tmp_dir = var_tmp_dir,
3564
3565 .creds_path = creds_path,
3566 .log_namespace = context->log_namespace,
3567 .mount_propagation_flag = context->mount_propagation_flag,
3568
3569 .verity = &verity,
3570
3571 .extension_images = context->extension_images,
3572 .n_extension_images = context->n_extension_images,
3573 .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
3574 .extension_directories = context->extension_directories,
3575
3576 .propagate_dir = propagate_dir,
3577 .incoming_dir = incoming_dir,
3578 .private_namespace_dir = private_namespace_dir,
3579 .host_notify_socket = params->notify_socket,
3580 .notify_socket_path = exec_get_private_notify_socket_path(context, params, needs_sandboxing),
3581 .host_os_release_stage = host_os_release_stage,
3582
3583 /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
3584 * otherwise enforce it, don't ignore protected paths and fail if we are enable to apply the
3585 * sandbox inside the mount namespace. */
3586 .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
3587
3588 .protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context) : PROTECT_CONTROL_GROUPS_NO,
3589 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
3590 .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
3591 .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
3592
3593 .private_dev = needs_sandboxing && context->private_devices,
3594 .private_network = needs_sandboxing && exec_needs_network_namespace(context),
3595 .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
3596 .private_pids = needs_sandboxing && exec_needs_pid_namespace(context, params) ? context->private_pids : PRIVATE_PIDS_NO,
3597 .private_tmp = needs_sandboxing ? context->private_tmp : PRIVATE_TMP_NO,
3598 .private_var_tmp = needs_sandboxing ? context->private_var_tmp : PRIVATE_TMP_NO,
3599
3600 .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
3601 .bind_log_sockets = needs_sandboxing && exec_context_get_effective_bind_log_sockets(context),
3602
3603 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3604 .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
3605
3606 .protect_home = needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
3607 .protect_hostname = needs_sandboxing ? context->protect_hostname : PROTECT_HOSTNAME_NO,
3608 .protect_system = needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
3609 .protect_proc = needs_sandboxing ? context->protect_proc : PROTECT_PROC_DEFAULT,
3610 .proc_subset = needs_sandboxing ? context->proc_subset : PROC_SUBSET_ALL,
3611 };
3612
3613 r = setup_namespace(&parameters, reterr_path);
3614 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3615 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3616 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3617 * completely different execution environment. */
3618 if (r == -ENOANO) {
3619 if (insist_on_sandboxing(
3620 context,
3621 root_dir, root_image,
3622 bind_mounts,
3623 n_bind_mounts))
3624 return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3625 "Failed to set up namespace, and refusing to continue since "
3626 "the selected namespacing options alter mount environment non-trivially.\n"
3627 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3628 n_bind_mounts,
3629 context->n_temporary_filesystems,
3630 yes_no(root_dir),
3631 yes_no(root_image),
3632 yes_no(context->dynamic_user));
3633
3634 log_debug("Failed to set up namespace, assuming containerized execution and ignoring.");
3635 return 0;
3636 }
3637
3638 return r;
3639 }
3640
3641 static int apply_working_directory(
3642 const ExecContext *context,
3643 const ExecParameters *params,
3644 ExecRuntime *runtime,
3645 const char *pwent_home,
3646 char * const *env) {
3647
3648 const char *wd;
3649 int r;
3650
3651 assert(context);
3652 assert(params);
3653 assert(runtime);
3654
3655 if (context->working_directory_home) {
3656 /* Preferably use the data from $HOME, in case it was updated by a PAM module */
3657 wd = strv_env_get(env, "HOME");
3658 if (!wd) {
3659 /* If that's not available, use the data from the struct passwd entry: */
3660 if (!pwent_home)
3661 return -ENXIO;
3662
3663 wd = pwent_home;
3664 }
3665 } else
3666 wd = empty_to_root(context->working_directory);
3667
3668 if (params->flags & EXEC_APPLY_CHROOT)
3669 r = RET_NERRNO(chdir(wd));
3670 else {
3671 _cleanup_close_ int dfd = -EBADF;
3672
3673 r = chase(wd,
3674 runtime->ephemeral_copy ?: context->root_directory,
3675 CHASE_PREFIX_ROOT|CHASE_AT_RESOLVE_IN_ROOT,
3676 /* ret_path= */ NULL,
3677 &dfd);
3678 if (r >= 0)
3679 r = RET_NERRNO(fchdir(dfd));
3680 }
3681 return context->working_directory_missing_ok ? 0 : r;
3682 }
3683
3684 static int apply_root_directory(
3685 const ExecContext *context,
3686 const ExecParameters *params,
3687 ExecRuntime *runtime,
3688 const bool needs_mount_ns,
3689 int *exit_status) {
3690
3691 assert(context);
3692 assert(params);
3693 assert(runtime);
3694 assert(exit_status);
3695
3696 if (params->flags & EXEC_APPLY_CHROOT)
3697 if (!needs_mount_ns && context->root_directory)
3698 if (chroot(runtime->ephemeral_copy ?: context->root_directory) < 0) {
3699 *exit_status = EXIT_CHROOT;
3700 return -errno;
3701 }
3702
3703 return 0;
3704 }
3705
3706 static int setup_keyring(
3707 const ExecContext *context,
3708 const ExecParameters *p,
3709 uid_t uid,
3710 gid_t gid) {
3711
3712 key_serial_t keyring;
3713 int r = 0;
3714 uid_t saved_uid;
3715 gid_t saved_gid;
3716
3717 assert(context);
3718 assert(p);
3719
3720 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3721 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3722 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3723 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3724 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3725 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3726
3727 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3728 return 0;
3729
3730 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3731 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3732 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3733 * & group is just as nasty as acquiring a reference to the user keyring. */
3734
3735 saved_uid = getuid();
3736 saved_gid = getgid();
3737
3738 if (gid_is_valid(gid) && gid != saved_gid) {
3739 if (setregid(gid, -1) < 0)
3740 return log_error_errno(errno, "Failed to change GID for user keyring: %m");
3741 }
3742
3743 if (uid_is_valid(uid) && uid != saved_uid) {
3744 if (setreuid(uid, -1) < 0) {
3745 r = log_error_errno(errno, "Failed to change UID for user keyring: %m");
3746 goto out;
3747 }
3748 }
3749
3750 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3751 if (keyring == -1) {
3752 if (errno == ENOSYS)
3753 log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
3754 else if (ERRNO_IS_PRIVILEGE(errno))
3755 log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
3756 else if (errno == EDQUOT)
3757 log_debug_errno(errno, "Out of kernel keyrings to allocate, ignoring.");
3758 else
3759 r = log_error_errno(errno, "Setting up kernel keyring failed: %m");
3760
3761 goto out;
3762 }
3763
3764 /* When requested link the user keyring into the session keyring. */
3765 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3766
3767 if (keyctl(KEYCTL_LINK,
3768 KEY_SPEC_USER_KEYRING,
3769 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3770 r = log_error_errno(errno, "Failed to link user keyring into session keyring: %m");
3771 goto out;
3772 }
3773 }
3774
3775 /* Restore uid/gid back */
3776 if (uid_is_valid(uid) && uid != saved_uid) {
3777 if (setreuid(saved_uid, -1) < 0) {
3778 r = log_error_errno(errno, "Failed to change UID back for user keyring: %m");
3779 goto out;
3780 }
3781 }
3782
3783 if (gid_is_valid(gid) && gid != saved_gid) {
3784 if (setregid(saved_gid, -1) < 0)
3785 return log_error_errno(errno, "Failed to change GID back for user keyring: %m");
3786 }
3787
3788 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3789 if (!sd_id128_is_null(p->invocation_id)) {
3790 key_serial_t key;
3791
3792 key = add_key("user",
3793 "invocation_id",
3794 &p->invocation_id,
3795 sizeof(p->invocation_id),
3796 KEY_SPEC_SESSION_KEYRING);
3797 if (key == -1)
3798 log_debug_errno(errno, "Failed to add invocation ID to keyring, ignoring: %m");
3799 else {
3800 if (keyctl(KEYCTL_SETPERM, key,
3801 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3802 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3803 r = log_error_errno(errno, "Failed to restrict invocation ID permission: %m");
3804 }
3805 }
3806
3807 out:
3808 /* Revert back uid & gid for the last time, and exit */
3809 /* no extra logging, as only the first already reported error matters */
3810 if (getuid() != saved_uid)
3811 (void) setreuid(saved_uid, -1);
3812
3813 if (getgid() != saved_gid)
3814 (void) setregid(saved_gid, -1);
3815
3816 return r;
3817 }
3818
/* Appends each valid (i.e. non-negative) fd of the socket pair to 'array', in order, advancing the
 * element counter *n for every fd added. The caller has to make sure the array is large enough. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
3829
3830 static int close_remaining_fds(
3831 const ExecParameters *params,
3832 const ExecRuntime *runtime,
3833 int socket_fd,
3834 const int *fds,
3835 size_t n_fds) {
3836
3837 size_t n_dont_close = 0;
3838 int dont_close[n_fds + 17];
3839
3840 assert(params);
3841 assert(runtime);
3842
3843 if (params->stdin_fd >= 0)
3844 dont_close[n_dont_close++] = params->stdin_fd;
3845 if (params->stdout_fd >= 0)
3846 dont_close[n_dont_close++] = params->stdout_fd;
3847 if (params->stderr_fd >= 0)
3848 dont_close[n_dont_close++] = params->stderr_fd;
3849
3850 if (socket_fd >= 0)
3851 dont_close[n_dont_close++] = socket_fd;
3852 if (n_fds > 0) {
3853 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3854 n_dont_close += n_fds;
3855 }
3856
3857 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3858
3859 if (runtime->shared) {
3860 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3861 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
3862 }
3863
3864 if (runtime->dynamic_creds) {
3865 if (runtime->dynamic_creds->user)
3866 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3867 if (runtime->dynamic_creds->group)
3868 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
3869 }
3870
3871 if (params->user_lookup_fd >= 0)
3872 dont_close[n_dont_close++] = params->user_lookup_fd;
3873
3874 if (params->handoff_timestamp_fd >= 0)
3875 dont_close[n_dont_close++] = params->handoff_timestamp_fd;
3876
3877 if (params->pidref_transport_fd >= 0)
3878 dont_close[n_dont_close++] = params->pidref_transport_fd;
3879
3880 assert(n_dont_close <= ELEMENTSOF(dont_close));
3881
3882 return close_all_fds(dont_close, n_dont_close);
3883 }
3884
/* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the
 * UID/GID data as well as the unit name. Note that we suppress sending this if no user/group to resolve
 * was specified, or if there is no fd to send it on. Returns 0 on success (or if nothing was sent),
 * negative errno if the write failed. */
static int send_user_lookup(
                const char *unit_id,
                int user_lookup_fd,
                uid_t uid,
                gid_t gid) {

        assert(unit_id);

        if (user_lookup_fd < 0 || (!uid_is_valid(uid) && !gid_is_valid(gid)))
                return 0;

        /* Payload layout: UID, then GID, then the unit name. */
        struct iovec iov[] = {
                IOVEC_MAKE(&uid, sizeof(uid)),
                IOVEC_MAKE(&gid, sizeof(gid)),
                IOVEC_MAKE_STRING(unit_id),
        };

        if (writev(user_lookup_fd, iov, 3) < 0)
                return -errno;

        return 0;
}
3912
3913 static int acquire_home(const ExecContext *c, const char **home, char **ret_buf) {
3914 int r;
3915
3916 assert(c);
3917 assert(home);
3918 assert(ret_buf);
3919
3920 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3921
3922 if (*home) /* Already acquired from get_fixed_user()? */
3923 return 0;
3924
3925 if (!c->working_directory_home)
3926 return 0;
3927
3928 if (c->dynamic_user || (c->user && is_this_me(c->user) <= 0))
3929 return -EADDRNOTAVAIL;
3930
3931 r = get_home_dir(ret_buf);
3932 if (r < 0)
3933 return r;
3934
3935 *home = *ret_buf;
3936 return 1;
3937 }
3938
3939 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3940 _cleanup_strv_free_ char ** list = NULL;
3941 int r;
3942
3943 assert(c);
3944 assert(p);
3945 assert(ret);
3946
3947 assert(c->dynamic_user);
3948
3949 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3950 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3951 * directories. */
3952
3953 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3954
3955 if (!EXEC_DIRECTORY_TYPE_SHALL_CHOWN(t))
3956 continue;
3957
3958 if (!p->prefix[t])
3959 continue;
3960
3961 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3962 char *e;
3963
3964 if (exec_directory_is_private(c, t))
3965 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3966 else
3967 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3968 if (!e)
3969 return -ENOMEM;
3970
3971 r = strv_consume(&list, e);
3972 if (r < 0)
3973 return r;
3974 }
3975 }
3976
3977 *ret = TAKE_PTR(list);
3978
3979 return 0;
3980 }
3981
3982 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3983 int r;
3984
3985 assert(c);
3986 assert(ret);
3987
3988 if (!c->numa_policy.nodes.set) {
3989 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3990 *ret = (CPUSet) {};
3991 return 0;
3992 }
3993
3994 _cleanup_(cpu_set_done) CPUSet s = {};
3995 r = numa_to_cpu_set(&c->numa_policy, &s);
3996 if (r < 0)
3997 return r;
3998
3999 *ret = TAKE_STRUCT(s);
4000 return 0;
4001 }
4002
4003 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int *fd) {
4004 int r;
4005
4006 assert(fds);
4007 assert(n_fds);
4008 assert(*n_fds < fds_size);
4009 assert(fd);
4010
4011 if (*fd < 0)
4012 return 0;
4013
4014 if (*fd < 3 + (int) *n_fds) {
4015 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4016 * the fds we pass to the process (or which are closed only during execve). */
4017
4018 r = fcntl(*fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4019 if (r < 0)
4020 return -errno;
4021
4022 close_and_replace(*fd, r);
4023 }
4024
4025 fds[(*n_fds)++] = *fd;
4026 return 1;
4027 }
4028
4029 static int connect_unix_harder(const OpenFile *of, int ofd) {
4030 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
4031
4032 union sockaddr_union addr = {
4033 .un.sun_family = AF_UNIX,
4034 };
4035 socklen_t sa_len;
4036 int r;
4037
4038 assert(of);
4039 assert(ofd >= 0);
4040
4041 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
4042 if (r < 0)
4043 return log_debug_errno(r, "Failed to set sockaddr for '%s': %m", of->path);
4044 sa_len = r;
4045
4046 FOREACH_ELEMENT(i, socket_types) {
4047 _cleanup_close_ int fd = -EBADF;
4048
4049 fd = socket(AF_UNIX, *i|SOCK_CLOEXEC, 0);
4050 if (fd < 0)
4051 return log_debug_errno(errno, "Failed to create socket for '%s': %m", of->path);
4052
4053 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
4054 if (r >= 0)
4055 return TAKE_FD(fd);
4056 if (r != -EPROTOTYPE)
4057 return log_debug_errno(r, "Failed to connect to socket for '%s': %m", of->path);
4058 }
4059
4060 return log_debug_errno(SYNTHETIC_ERRNO(EPROTOTYPE), "No suitable socket type to connect to socket '%s'.", of->path);
4061 }
4062
4063 static int get_open_file_fd(const OpenFile *of) {
4064 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
4065 struct stat st;
4066
4067 assert(of);
4068
4069 ofd = open(of->path, O_PATH | O_CLOEXEC);
4070 if (ofd < 0)
4071 return log_debug_errno(errno, "Failed to open '%s' as O_PATH: %m", of->path);
4072
4073 if (fstat(ofd, &st) < 0)
4074 return log_debug_errno( errno, "Failed to stat '%s': %m", of->path);
4075
4076 if (S_ISSOCK(st.st_mode)) {
4077 fd = connect_unix_harder(of, ofd);
4078 if (fd < 0)
4079 return fd;
4080
4081 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
4082 return log_debug_errno(errno, "Failed to shutdown send for socket '%s': %m", of->path);
4083
4084 log_debug("Opened socket '%s' as fd %d.", of->path, fd);
4085 } else {
4086 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
4087 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
4088 flags |= O_APPEND;
4089 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
4090 flags |= O_TRUNC;
4091
4092 fd = fd_reopen(ofd, flags|O_NOCTTY|O_CLOEXEC);
4093 if (fd < 0)
4094 return log_debug_errno(fd, "Failed to reopen file '%s': %m", of->path);
4095
4096 log_debug("Opened file '%s' as fd %d.", of->path, fd);
4097 }
4098
4099 return TAKE_FD(fd);
4100 }
4101
4102 static int collect_open_file_fds(ExecParameters *p, size_t *n_fds) {
4103 assert(p);
4104 assert(n_fds);
4105
4106 LIST_FOREACH(open_files, of, p->open_files) {
4107 _cleanup_close_ int fd = -EBADF;
4108
4109 fd = get_open_file_fd(of);
4110 if (fd < 0) {
4111 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
4112 log_full_errno(fd == -ENOENT || ERRNO_IS_NEG_PRIVILEGE(fd) ? LOG_DEBUG : LOG_WARNING,
4113 fd,
4114 "Failed to get OpenFile= file descriptor for '%s', ignoring: %m",
4115 of->path);
4116 continue;
4117 }
4118
4119 return log_error_errno(fd, "Failed to get OpenFile= file descriptor for '%s': %m", of->path);
4120 }
4121
4122 if (!GREEDY_REALLOC(p->fds, *n_fds + 1))
4123 return log_oom();
4124
4125 if (strv_extend(&p->fd_names, of->fdname) < 0)
4126 return log_oom();
4127
4128 p->fds[(*n_fds)++] = TAKE_FD(fd);
4129 }
4130
4131 return 0;
4132 }
4133
4134 static void log_command_line(
4135 const ExecContext *context,
4136 const ExecParameters *params,
4137 const char *msg,
4138 const char *executable,
4139 char **argv) {
4140
4141 assert(context);
4142 assert(params);
4143 assert(msg);
4144 assert(executable);
4145
4146 if (!DEBUG_LOGGING)
4147 return;
4148
4149 _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
4150
4151 log_struct(LOG_DEBUG,
4152 LOG_ITEM("EXECUTABLE=%s", executable),
4153 LOG_EXEC_MESSAGE(params, "%s: %s", msg, strnull(cmdline)),
4154 LOG_EXEC_INVOCATION_ID(params));
4155 }
4156
4157 static bool exec_needs_cap_sys_admin(const ExecContext *context, const ExecParameters *params) {
4158 assert(context);
4159
4160 return context->private_users != PRIVATE_USERS_NO ||
4161 context->private_tmp != PRIVATE_TMP_NO ||
4162 context->private_devices ||
4163 context->private_network ||
4164 context->network_namespace_path ||
4165 context->private_ipc ||
4166 context->ipc_namespace_path ||
4167 context->private_mounts > 0 ||
4168 context->mount_apivfs > 0 ||
4169 context->bind_log_sockets > 0 ||
4170 context->n_bind_mounts > 0 ||
4171 context->n_temporary_filesystems > 0 ||
4172 context->root_directory ||
4173 !strv_isempty(context->extension_directories) ||
4174 context->protect_system != PROTECT_SYSTEM_NO ||
4175 context->protect_home != PROTECT_HOME_NO ||
4176 exec_needs_pid_namespace(context, params) ||
4177 context->protect_kernel_tunables ||
4178 context->protect_kernel_modules ||
4179 context->protect_kernel_logs ||
4180 exec_needs_cgroup_mount(context) ||
4181 context->protect_clock ||
4182 context->protect_hostname != PROTECT_HOSTNAME_NO ||
4183 !strv_isempty(context->read_write_paths) ||
4184 !strv_isempty(context->read_only_paths) ||
4185 !strv_isempty(context->inaccessible_paths) ||
4186 !strv_isempty(context->exec_paths) ||
4187 !strv_isempty(context->no_exec_paths) ||
4188 context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL;
4189 }
4190
4191 static PrivateUsers exec_context_get_effective_private_users(
4192 const ExecContext *context,
4193 const ExecParameters *params) {
4194
4195 assert(context);
4196 assert(params);
4197
4198 if (context->private_users != PRIVATE_USERS_NO)
4199 return context->private_users;
4200
4201 /* If any namespace is delegated with DelegateNamespaces=, always set up a user namespace. */
4202 if (context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL)
4203 return PRIVATE_USERS_SELF;
4204
4205 return PRIVATE_USERS_NO;
4206 }
4207
4208 static bool exec_namespace_is_delegated(
4209 const ExecContext *context,
4210 const ExecParameters *params,
4211 bool have_cap_sys_admin,
4212 unsigned long namespace) {
4213
4214 assert(context);
4215 assert(params);
4216 assert(namespace != CLONE_NEWUSER);
4217
4218 /* If we need unprivileged private users, we've already unshared a user namespace by the time we call
4219 * setup_delegated_namespaces() for the first time so let's make sure we do all other namespace
4220 * unsharing in the first call to setup_delegated_namespaces() by returning false here. */
4221 if (!have_cap_sys_admin && exec_needs_cap_sys_admin(context, params))
4222 return false;
4223
4224 if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL)
4225 return params->runtime_scope == RUNTIME_SCOPE_USER;
4226
4227 if (FLAGS_SET(context->delegate_namespaces, namespace))
4228 return true;
4229
4230 /* Various namespaces imply mountns for private procfs/sysfs/cgroupfs instances, which means when
4231 * those are delegated mountns must be deferred too.
4232 *
4233 * The list should stay in sync with exec_needs_mount_namespace(). */
4234 if (namespace == CLONE_NEWNS)
4235 return context->delegate_namespaces & (CLONE_NEWPID|CLONE_NEWCGROUP|CLONE_NEWNET);
4236
4237 return false;
4238 }
4239
4240 static int setup_delegated_namespaces(
4241 const ExecContext *context,
4242 ExecParameters *params,
4243 ExecRuntime *runtime,
4244 bool delegate,
4245 const char *memory_pressure_path,
4246 uid_t uid,
4247 uid_t gid,
4248 const ExecCommand *command,
4249 bool needs_sandboxing,
4250 bool have_cap_sys_admin,
4251 int *reterr_exit_status) {
4252
4253 int r;
4254
4255 /* This function is called twice, once before unsharing the user namespace, and once after unsharing
4256 * the user namespace. When called before unsharing the user namespace, "delegate" is set to "false".
4257 * When called after unsharing the user namespace, "delegate" is set to "true". The net effect is
4258 * that all namespaces that should not be delegated are unshared when this function is called the
4259 * first time and all namespaces that should be delegated are unshared when this function is called
4260 * the second time. */
4261
4262 assert(context);
4263 assert(params);
4264 assert(runtime);
4265 assert(reterr_exit_status);
4266
4267 if (exec_needs_network_namespace(context) &&
4268 exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWNET) == delegate &&
4269 runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4270
4271 /* Try to enable network namespacing if network namespacing is available and we have
4272 * CAP_NET_ADMIN in the current user namespace (either the system manager one or the unit's
4273 * own user namespace). We need CAP_NET_ADMIN to be able to configure the loopback device in
4274 * the new network namespace. And if we don't have that, then we could only create a network
4275 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4276 if (namespace_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
4277 r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
4278 if (ERRNO_IS_NEG_PRIVILEGE(r))
4279 log_notice_errno(r, "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4280 else if (r < 0) {
4281 *reterr_exit_status = EXIT_NETWORK;
4282 return log_error_errno(r, "Failed to set up network namespacing: %m");
4283 } else
4284 log_debug("Set up %snetwork namespace", delegate ? "delegated " : "");
4285 } else if (context->network_namespace_path) {
4286 *reterr_exit_status = EXIT_NETWORK;
4287 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "NetworkNamespacePath= is not supported, refusing.");
4288 } else
4289 log_notice("PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4290 }
4291
4292 if (exec_needs_ipc_namespace(context) &&
4293 exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWIPC) == delegate &&
4294 runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4295
4296 if (namespace_type_supported(NAMESPACE_IPC)) {
4297 r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4298 if (ERRNO_IS_NEG_PRIVILEGE(r))
4299 log_warning_errno(r, "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4300 else if (r < 0) {
4301 *reterr_exit_status = EXIT_NAMESPACE;
4302 return log_error_errno(r, "Failed to set up IPC namespacing: %m");
4303 } else
4304 log_debug("Set up %sIPC namespace", delegate ? "delegated " : "");
4305 } else if (context->ipc_namespace_path) {
4306 *reterr_exit_status = EXIT_NAMESPACE;
4307 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "IPCNamespacePath= is not supported, refusing.");
4308 } else
4309 log_warning("PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4310 }
4311
4312 if (needs_sandboxing && exec_needs_cgroup_namespace(context) &&
4313 exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) {
4314 if (unshare(CLONE_NEWCGROUP) < 0) {
4315 *reterr_exit_status = EXIT_NAMESPACE;
4316 return log_error_errno(errno, "Failed to set up cgroup namespacing: %m");
4317 }
4318
4319 log_debug("Set up %scgroup namespace", delegate ? "delegated " : "");
4320 }
4321
4322 /* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible.
4323 * Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */
4324 if (needs_sandboxing && exec_needs_pid_namespace(context, params) &&
4325 exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWPID) == delegate) {
4326 if (params->pidref_transport_fd < 0) {
4327 *reterr_exit_status = EXIT_NAMESPACE;
4328 return log_error_errno(SYNTHETIC_ERRNO(ENOTCONN), "PidRef socket is not set up: %m");
4329 }
4330
4331 /* If we had CAP_SYS_ADMIN prior to joining the user namespace, then we are privileged and don't need
4332 * to check if we can mount /proc/.
4333 *
4334 * We need to check prior to entering the user namespace because if we're running unprivileged or in a
4335 * system without CAP_SYS_ADMIN, then we can have CAP_SYS_ADMIN in the current user namespace but not
4336 * once we unshare a mount namespace. */
4337 if (!have_cap_sys_admin || delegate) {
4338 r = can_mount_proc();
4339 if (r < 0) {
4340 *reterr_exit_status = EXIT_NAMESPACE;
4341 return log_error_errno(r, "Failed to detect if /proc/ can be remounted: %m");
4342 }
4343 if (r == 0) {
4344 *reterr_exit_status = EXIT_NAMESPACE;
4345 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
4346 "PrivatePIDs=yes is configured, but /proc/ cannot be re-mounted due to lack of privileges, refusing.");
4347 }
4348 }
4349
4350 r = setup_private_pids(context, params);
4351 if (r < 0) {
4352 *reterr_exit_status = EXIT_NAMESPACE;
4353 return log_error_errno(r, "Failed to set up pid namespace: %m");
4354 }
4355
4356 log_debug("Set up %spid namespace", delegate ? "delegated " : "");
4357 }
4358
4359 /* If PrivatePIDs= yes is configured, we're now running as pid 1 in a pid namespace! */
4360
4361 if (exec_needs_mount_namespace(context, params, runtime) &&
4362 exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWNS) == delegate) {
4363 _cleanup_free_ char *error_path = NULL;
4364
4365 r = apply_mount_namespace(command->flags,
4366 context,
4367 params,
4368 runtime,
4369 memory_pressure_path,
4370 needs_sandboxing,
4371 &error_path,
4372 uid,
4373 gid);
4374 if (r < 0) {
4375 *reterr_exit_status = EXIT_NAMESPACE;
4376 return log_error_errno(r, "Failed to set up mount namespacing%s%s: %m",
4377 error_path ? ": " : "", strempty(error_path));
4378 }
4379
4380 log_debug("Set up %smount namespace", delegate ? "delegated " : "");
4381 }
4382
4383 if (needs_sandboxing &&
4384 exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWUTS) == delegate) {
4385 r = apply_protect_hostname(context, params, reterr_exit_status);
4386 if (r < 0)
4387 return r;
4388 if (r > 0)
4389 log_debug("Set up %sUTS namespace", delegate ? "delegated " : "");
4390 }
4391
4392 return 0;
4393 }
4394
4395 static bool exec_context_shall_confirm_spawn(const ExecContext *context) {
4396 assert(context);
4397
4398 if (confirm_spawn_disabled())
4399 return false;
4400
4401 /* For some reasons units remaining in the same process group
4402 * as PID 1 fail to acquire the console even if it's not used
4403 * by any process. So skip the confirmation question for them. */
4404 return !context->same_pgrp;
4405 }
4406
4407 static int exec_context_named_iofds(
4408 const ExecContext *c,
4409 const ExecParameters *p,
4410 int named_iofds[static 3]) {
4411
4412 size_t targets;
4413 const char* stdio_fdname[3];
4414 size_t n_fds;
4415
4416 assert(c);
4417 assert(p);
4418 assert(named_iofds);
4419
4420 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4421 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4422 (c->std_error == EXEC_OUTPUT_NAMED_FD);
4423
4424 for (size_t i = 0; i < 3; i++)
4425 stdio_fdname[i] = exec_context_fdname(c, i);
4426
4427 n_fds = p->n_storage_fds + p->n_socket_fds + p->n_extra_fds;
4428
4429 for (size_t i = 0; i < n_fds && targets > 0; i++)
4430 if (named_iofds[STDIN_FILENO] < 0 &&
4431 c->std_input == EXEC_INPUT_NAMED_FD &&
4432 stdio_fdname[STDIN_FILENO] &&
4433 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4434
4435 named_iofds[STDIN_FILENO] = p->fds[i];
4436 targets--;
4437
4438 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4439 c->std_output == EXEC_OUTPUT_NAMED_FD &&
4440 stdio_fdname[STDOUT_FILENO] &&
4441 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4442
4443 named_iofds[STDOUT_FILENO] = p->fds[i];
4444 targets--;
4445
4446 } else if (named_iofds[STDERR_FILENO] < 0 &&
4447 c->std_error == EXEC_OUTPUT_NAMED_FD &&
4448 stdio_fdname[STDERR_FILENO] &&
4449 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4450
4451 named_iofds[STDERR_FILENO] = p->fds[i];
4452 targets--;
4453 }
4454
4455 return targets == 0 ? 0 : -ENOENT;
4456 }
4457
4458 static void exec_shared_runtime_close(ExecSharedRuntime *shared) {
4459 if (!shared)
4460 return;
4461
4462 safe_close_pair(shared->netns_storage_socket);
4463 safe_close_pair(shared->ipcns_storage_socket);
4464 }
4465
4466 static void exec_runtime_close(ExecRuntime *rt) {
4467 if (!rt)
4468 return;
4469
4470 safe_close_pair(rt->ephemeral_storage_socket);
4471
4472 exec_shared_runtime_close(rt->shared);
4473 dynamic_creds_close(rt->dynamic_creds);
4474 }
4475
4476 static void exec_params_close(ExecParameters *p) {
4477 if (!p)
4478 return;
4479
4480 p->stdin_fd = safe_close(p->stdin_fd);
4481 p->stdout_fd = safe_close(p->stdout_fd);
4482 p->stderr_fd = safe_close(p->stderr_fd);
4483 }
4484
4485 static int exec_fd_mark_hot(
4486 const ExecContext *c,
4487 ExecParameters *p,
4488 bool hot,
4489 int *reterr_exit_status) {
4490
4491 assert(c);
4492 assert(p);
4493
4494 if (p->exec_fd < 0)
4495 return 0;
4496
4497 uint8_t x = hot;
4498
4499 if (write(p->exec_fd, &x, sizeof(x)) < 0) {
4500 if (reterr_exit_status)
4501 *reterr_exit_status = EXIT_EXEC;
4502 return log_error_errno(errno, "Failed to mark exec_fd as %s: %m", hot ? "hot" : "cold");
4503 }
4504
4505 return 1;
4506 }
4507
4508 static int send_handoff_timestamp(
4509 const ExecContext *c,
4510 ExecParameters *p,
4511 int *reterr_exit_status) {
4512
4513 assert(c);
4514 assert(p);
4515
4516 if (p->handoff_timestamp_fd < 0)
4517 return 0;
4518
4519 dual_timestamp dt;
4520 dual_timestamp_now(&dt);
4521
4522 if (write(p->handoff_timestamp_fd, (const usec_t[2]) { dt.realtime, dt.monotonic }, sizeof(usec_t) * 2) < 0) {
4523 if (reterr_exit_status)
4524 *reterr_exit_status = EXIT_EXEC;
4525 return log_error_errno(errno, "Failed to send handoff timestamp: %m");
4526 }
4527
4528 return 1;
4529 }
4530
4531 static void prepare_terminal(
4532 const ExecContext *context,
4533 ExecParameters *p) {
4534
4535 _cleanup_close_ int lock_fd = -EBADF;
4536
4537 /* This is the "constructive" reset, i.e. is about preparing things for our invocation rather than
4538 * cleaning up things from older invocations. */
4539
4540 assert(context);
4541 assert(p);
4542
4543 /* We only try to reset things if we there's the chance our stdout points to a TTY */
4544 if (!(is_terminal_output(context->std_output) ||
4545 (context->std_output == EXEC_OUTPUT_INHERIT && is_terminal_input(context->std_input)) ||
4546 context->std_output == EXEC_OUTPUT_NAMED_FD ||
4547 p->stdout_fd >= 0))
4548 return;
4549
4550 /* Let's explicitly determine whether to reset via ANSI sequences or not, taking our ExecContext
4551 * information into account */
4552 bool use_ansi = exec_context_shall_ansi_seq_reset(context);
4553
4554 if (context->tty_reset) {
4555 /* When we are resetting the TTY, then let's create a lock first, to synchronize access. This
4556 * in particular matters as concurrent resets and the TTY size ANSI DSR logic done by the
4557 * exec_context_apply_tty_size() below might interfere */
4558 lock_fd = lock_dev_console();
4559 if (lock_fd < 0)
4560 log_debug_errno(lock_fd, "Failed to lock /dev/console, ignoring: %m");
4561
4562 /* We explicitly control whether to send ansi sequences or not here, since we want to consult
4563 * the env vars explicitly configured in the ExecContext, rather than our own environment
4564 * block. */
4565 (void) terminal_reset_defensive(STDOUT_FILENO, use_ansi ? TERMINAL_RESET_FORCE_ANSI_SEQ : TERMINAL_RESET_AVOID_ANSI_SEQ);
4566 }
4567
4568 (void) exec_context_apply_tty_size(context, STDIN_FILENO, STDOUT_FILENO, /* tty_path= */ NULL);
4569
4570 if (use_ansi)
4571 (void) osc_context_open_service(p->unit_id, p->invocation_id, /* ret_seq= */ NULL);
4572 }
4573
4574 static int setup_term_environment(const ExecContext *context, char ***env) {
4575 int r;
4576
4577 assert(context);
4578 assert(env);
4579
4580 /* Already specified by user? */
4581 if (strv_env_get(*env, "TERM"))
4582 return 0;
4583
4584 /* Do we need $TERM at all? */
4585 if (!is_terminal_input(context->std_input) &&
4586 !is_terminal_output(context->std_output) &&
4587 !is_terminal_output(context->std_error) &&
4588 !context->tty_path)
4589 return 0;
4590
4591 const char *tty_path = exec_context_tty_path(context);
4592 if (tty_path) {
4593 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
4594 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
4595 * container manager passes to PID 1 ends up all the way in the console login shown.
4596 *
4597 * Note that if this doesn't work out we won't bother with querying systemd.tty.term.console
4598 * kernel cmdline option or DCS anymore either, because pid1 also imports $TERM based on those
4599 * and it should have showed up as our $TERM if there were anything. */
4600 if (tty_is_console(tty_path) && getppid() == 1) {
4601 const char *term = strv_find_prefix(environ, "TERM=");
4602 if (term) {
4603 r = strv_env_replace_strdup(env, term);
4604 if (r < 0)
4605 return r;
4606
4607 FOREACH_STRING(i, "COLORTERM=", "NO_COLOR=") {
4608 const char *s = strv_find_prefix(environ, i);
4609 if (!s)
4610 continue;
4611
4612 r = strv_env_replace_strdup(env, s);
4613 if (r < 0)
4614 return r;
4615 }
4616
4617 return 1;
4618 }
4619
4620 } else {
4621 if (in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
4622 _cleanup_free_ char *key = NULL, *cmdline = NULL;
4623
4624 key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
4625 if (!key)
4626 return -ENOMEM;
4627
4628 r = proc_cmdline_get_key(key, /* flags = */ 0, &cmdline);
4629 if (r > 0)
4630 return strv_env_assign(env, "TERM", cmdline);
4631 if (r < 0)
4632 log_debug_errno(r, "Failed to read '%s' from kernel cmdline, ignoring: %m", key);
4633 }
4634
4635 /* This handles real virtual terminals (returning "linux") and
4636 * any terminals which support the DCS +q query sequence. */
4637 _cleanup_free_ char *dcs_term = NULL;
4638 r = query_term_for_tty(tty_path, &dcs_term);
4639 if (r >= 0)
4640 return strv_env_assign(env, "TERM", dcs_term);
4641 }
4642 }
4643
4644 /* If $TERM is not known and we pick a fallback default, then let's also set
4645 * $COLORTERM=truecolor. That's because our fallback default is vt220, which is
4646 * generally a safe bet (as it supports PageUp/PageDown unlike vt100, and is quite
4647 * universally available in terminfo/termcap), except for the fact that real DEC
4648 * vt220 gear never actually supported color. Most tools these days generate color on
4649 * vt220 anyway, ignoring the physical capabilities of the real hardware, but some
4650 * tools actually believe in the historical truth. Which is unfortunate since *we*
4651 * *don't* care about the historical truth, we just want sane defaults if nothing
4652 * better is explicitly configured. It's 2025 after all, at the time of writing,
4653 * pretty much all terminal emulators actually *do* support color, hence if we don't
4654 * know any better let's explicitly claim color support via $COLORTERM. Or in other
4655 * words: we now explicitly claim to be connected to a franken-vt220 with true color
4656 * support. */
4657 r = strv_env_replace_strdup(env, "COLORTERM=truecolor");
4658 if (r < 0)
4659 return r;
4660
4661 return strv_env_replace_strdup(env, "TERM=" FALLBACK_TERM);
4662 }
4663
4664 int exec_invoke(
4665 const ExecCommand *command,
4666 const ExecContext *context,
4667 ExecParameters *params,
4668 ExecRuntime *runtime,
4669 const CGroupContext *cgroup_context,
4670 int *exit_status) {
4671
4672 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL;
4673 int r;
4674 const char *username = NULL, *groupname = NULL;
4675 _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL, *own_user = NULL;
4676 const char *pwent_home = NULL, *shell = NULL;
4677 dev_t journal_stream_dev = 0;
4678 ino_t journal_stream_ino = 0;
4679 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4680 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
4681 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
4682 have_cap_sys_admin,
4683 userns_set_up = false,
4684 keep_seccomp_privileges = false;
4685 #if HAVE_SELINUX
4686 _cleanup_free_ char *mac_selinux_context_net = NULL;
4687 bool use_selinux = false;
4688 #endif
4689 #if ENABLE_SMACK
4690 bool use_smack = false;
4691 #endif
4692 #if HAVE_APPARMOR
4693 bool use_apparmor = false;
4694 #endif
4695 #if HAVE_SECCOMP
4696 uint64_t saved_bset = 0;
4697 #endif
4698 uid_t saved_uid = getuid();
4699 gid_t saved_gid = getgid();
4700 uid_t uid = UID_INVALID;
4701 gid_t gid = GID_INVALID;
4702 size_t n_fds, /* fds to pass to the child */
4703 n_keep_fds; /* total number of fds not to close */
4704 int secure_bits;
4705 _cleanup_free_ gid_t *gids = NULL, *gids_after_pam = NULL;
4706 int ngids = 0, ngids_after_pam = 0;
4707 int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
4708 size_t n_storage_fds, n_socket_fds, n_extra_fds;
4709
4710 assert(command);
4711 assert(context);
4712 assert(params);
4713 assert(runtime);
4714 assert(cgroup_context);
4715 assert(exit_status);
4716
4717 LOG_CONTEXT_PUSH_EXEC(context, params);
4718
4719 /* Explicitly test for CVE-2021-4034 inspired invocations */
4720 if (!command->path || strv_isempty(command->argv)) {
4721 *exit_status = EXIT_EXEC;
4722 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid command line arguments.");
4723 }
4724
4725 if (context->std_input == EXEC_INPUT_SOCKET ||
4726 context->std_output == EXEC_OUTPUT_SOCKET ||
4727 context->std_error == EXEC_OUTPUT_SOCKET) {
4728
4729 if (params->n_socket_fds > 1)
4730 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
4731
4732 if (params->n_socket_fds == 0)
4733 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
4734
4735 socket_fd = params->fds[0];
4736 n_storage_fds = n_socket_fds = n_extra_fds = 0;
4737 } else {
4738 n_socket_fds = params->n_socket_fds;
4739 n_storage_fds = params->n_storage_fds;
4740 n_extra_fds = params->n_extra_fds;
4741 }
4742 n_fds = n_socket_fds + n_storage_fds + n_extra_fds;
4743
4744 r = exec_context_named_iofds(context, params, named_iofds);
4745 if (r < 0)
4746 return log_error_errno(r, "Failed to load a named file descriptor: %m");
4747
4748 rename_process_from_path(command->path);
4749
4750 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4751 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4752 * both of which will be demoted to SIG_DFL. */
4753 (void) default_signals(SIGNALS_CRASH_HANDLER,
4754 SIGNALS_IGNORE);
4755
4756 if (context->ignore_sigpipe)
4757 (void) ignore_signals(SIGPIPE);
4758
4759 r = reset_signal_mask();
4760 if (r < 0) {
4761 *exit_status = EXIT_SIGNAL_MASK;
4762 return log_error_errno(r, "Failed to set process signal mask: %m");
4763 }
4764
4765 if (params->idle_pipe)
4766 do_idle_pipe_dance(params->idle_pipe);
4767
4768 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4769 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4770 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4771 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4772
4773 log_forget_fds();
4774 log_set_open_when_needed(true);
4775 log_settle_target();
4776
4777 /* In case anything used libc syslog(), close this here, too */
4778 closelog();
4779
4780 r = collect_open_file_fds(params, &n_fds);
4781 if (r < 0) {
4782 *exit_status = EXIT_FDS;
4783 return log_error_errno(r, "Failed to get OpenFile= file descriptors: %m");
4784 }
4785
4786 int keep_fds[n_fds + 4];
4787 memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int));
4788 n_keep_fds = n_fds;
4789
4790 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->exec_fd);
4791 if (r < 0) {
4792 *exit_status = EXIT_FDS;
4793 return log_error_errno(r, "Failed to collect shifted fd: %m");
4794 }
4795
4796 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->handoff_timestamp_fd);
4797 if (r < 0) {
4798 *exit_status = EXIT_FDS;
4799 return log_error_errno(r, "Failed to collect shifted fd: %m");
4800 }
4801
4802 #if HAVE_LIBBPF
4803 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->bpf_restrict_fs_map_fd);
4804 if (r < 0) {
4805 *exit_status = EXIT_FDS;
4806 return log_error_errno(r, "Failed to collect shifted fd: %m");
4807 }
4808 #endif
4809
4810 r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds);
4811 if (r < 0) {
4812 *exit_status = EXIT_FDS;
4813 return log_error_errno(r, "Failed to close unwanted file descriptors: %m");
4814 }
4815
4816 if (!context->same_pgrp &&
4817 setsid() < 0) {
4818 *exit_status = EXIT_SETSID;
4819 return log_error_errno(errno, "Failed to create new process session: %m");
4820 }
4821
4822 /* Now, reset the TTY associated to this service "destructively" (i.e. possibly even hang up or
4823 * disallocate the VT), to get rid of any prior uses of the device. Note that we do not keep any fd
4824 * open here, hence some of the settings made here might vanish again, depending on the TTY driver
4825 * used. A 2nd ("constructive") initialization after we opened the input/output fds we actually want
4826 * will fix this. Note that we pass a NULL invocation ID here – as exec_context_tty_reset() expects
4827 * the invocation ID associated with the OSC 3008 context ID to close. But we don't want to close any
4828 * OSC 3008 context here, and opening a fresh OSC 3008 context happens a bit further down. */
4829 exec_context_tty_reset(context, params, /* invocation_id= */ SD_ID128_NULL);
4830
4831 if (params->shall_confirm_spawn && exec_context_shall_confirm_spawn(context)) {
4832 _cleanup_free_ char *cmdline = NULL;
4833
4834 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4835 if (!cmdline) {
4836 *exit_status = EXIT_MEMORY;
4837 return log_oom();
4838 }
4839
4840 r = ask_for_confirmation(context, params, cmdline);
4841 if (r != CONFIRM_EXECUTE) {
4842 if (r == CONFIRM_PRETEND_SUCCESS) {
4843 *exit_status = EXIT_SUCCESS;
4844 return 0;
4845 }
4846
4847 *exit_status = EXIT_CONFIRM;
4848 return log_error_errno(SYNTHETIC_ERRNO(ECANCELED), "Execution cancelled by the user.");
4849 }
4850 }
4851
4852 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4853 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4854 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4855 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4856 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4857 if (setenv("SYSTEMD_ACTIVATION_UNIT", params->unit_id, true) != 0 ||
4858 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4859 *exit_status = EXIT_MEMORY;
4860 return log_error_errno(errno, "Failed to update environment: %m");
4861 }
4862
4863 if (context->dynamic_user && runtime->dynamic_creds) {
4864 _cleanup_strv_free_ char **suggested_paths = NULL;
4865
4866 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4867 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4868 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4869 *exit_status = EXIT_USER;
4870 return log_error_errno(errno, "Failed to update environment: %m");
4871 }
4872
4873 r = compile_suggested_paths(context, params, &suggested_paths);
4874 if (r < 0) {
4875 *exit_status = EXIT_MEMORY;
4876 return log_oom();
4877 }
4878
4879 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4880 if (r < 0) {
4881 *exit_status = EXIT_USER;
4882 if (r == -EILSEQ)
4883 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
4884 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4885 return log_error_errno(r, "Failed to update dynamic user credentials: %m");
4886 }
4887
4888 if (!uid_is_valid(uid)) {
4889 *exit_status = EXIT_USER;
4890 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\".", uid);
4891 }
4892
4893 if (!gid_is_valid(gid)) {
4894 *exit_status = EXIT_USER;
4895 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\".", gid);
4896 }
4897
4898 if (runtime->dynamic_creds->user)
4899 username = runtime->dynamic_creds->user->name;
4900
4901 } else {
4902 const char *u;
4903
4904 if (context->user)
4905 u = context->user;
4906 else if (context->pam_name || FLAGS_SET(command->flags, EXEC_COMMAND_VIA_SHELL)) {
4907 /* If PAM is enabled but no user name is explicitly selected, then use our own one. */
4908 own_user = getusername_malloc();
4909 if (!own_user) {
4910 *exit_status = EXIT_USER;
4911 return log_error_errno(r, "Failed to determine my own user ID: %m");
4912 }
4913 u = own_user;
4914 } else
4915 u = NULL;
4916
4917 if (u) {
4918 /* We can't use nss unconditionally for root without risking deadlocks if some IPC services
4919 * will be started by pid1 and are ordered after us. But if SetLoginEnvironment= is
4920 * enabled *explicitly* (i.e. no exec_context_get_set_login_environment() here),
4921 * or PAM shall be invoked, let's consult NSS even for root, so that the user
4922 * gets accurate $SHELL in session(-like) contexts. */
4923 r = get_fixed_user(u,
4924 /* prefer_nss = */ context->set_login_environment > 0 || context->pam_name,
4925 &username, &uid, &gid, &pwent_home, &shell);
4926 if (r < 0) {
4927 *exit_status = EXIT_USER;
4928 return log_error_errno(r, "Failed to determine user credentials: %m");
4929 }
4930 }
4931
4932 if (context->group) {
4933 r = get_fixed_group(context->group, &groupname, &gid);
4934 if (r < 0) {
4935 *exit_status = EXIT_GROUP;
4936 return log_error_errno(r, "Failed to determine group credentials: %m");
4937 }
4938 }
4939 }
4940
4941 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4942 ngids = get_supplementary_groups(context, username, gid, &gids);
4943 if (ngids < 0) {
4944 *exit_status = EXIT_GROUP;
4945 return log_error_errno(ngids, "Failed to determine supplementary groups: %m");
4946 }
4947
4948 r = send_user_lookup(params->unit_id, params->user_lookup_fd, uid, gid);
4949 if (r < 0) {
4950 *exit_status = EXIT_USER;
4951 return log_error_errno(r, "Failed to send user credentials to PID1: %m");
4952 }
4953
4954 params->user_lookup_fd = safe_close(params->user_lookup_fd);
4955
4956 r = acquire_home(context, &pwent_home, &home_buffer);
4957 if (r < 0) {
4958 *exit_status = EXIT_CHDIR;
4959 return log_error_errno(r, "Failed to determine $HOME for the invoking user: %m");
4960 }
4961
4962 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4963 if (socket_fd >= 0)
4964 (void) fd_nonblock(socket_fd, false);
4965
4966 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4967 * from it. */
4968 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4969
4970 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4971 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4972 if (params->cgroup_path) {
4973 _cleanup_free_ char *subcgroup = NULL;
4974
4975 r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &subcgroup);
4976 if (r < 0) {
4977 *exit_status = EXIT_CGROUP;
4978 return log_error_errno(r, "Failed to acquire cgroup path: %m");
4979 }
4980 if (r > 0) {
4981 /* If there is a subcgroup required, let's make sure to create it now. */
4982 r = cg_create(subcgroup);
4983 if (r < 0)
4984 return log_error_errno(r, "Failed to create subcgroup '%s': %m", subcgroup);
4985 }
4986
4987 /* If we need a cgroup namespace, we cannot yet move the service to its configured subgroup,
4988 * as unsharing the cgroup namespace later on makes the current cgroup the root of the
4989 * namespace and we want the root of the namespace to be the main service cgroup and not the
4990 * subgroup. One edge case is if we're a control process that needs to be spawned in a
4991 * subgroup, in this case, we have no choice as moving into the main service cgroup might
4992 * violate the no inner processes rule of cgroupv2. */
4993 const char *cgtarget = needs_sandboxing && exec_needs_cgroup_namespace(context) &&
4994 !exec_params_needs_control_subcgroup(params)
4995 ? params->cgroup_path : subcgroup;
4996
4997 r = cg_attach(cgtarget, 0);
4998 if (r == -EUCLEAN) {
4999 *exit_status = EXIT_CGROUP;
5000 return log_error_errno(r,
5001 "Failed to attach process to cgroup '%s', "
5002 "because the cgroup or one of its parents or "
5003 "siblings is in the threaded mode.", cgtarget);
5004 }
5005 if (r < 0) {
5006 *exit_status = EXIT_CGROUP;
5007 return log_error_errno(r, "Failed to attach to cgroup %s: %m", cgtarget);
5008 }
5009 }
5010
5011 if (context->network_namespace_path && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
5012 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
5013 if (r < 0) {
5014 *exit_status = EXIT_NETWORK;
5015 return log_error_errno(r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
5016 }
5017 }
5018
5019 if (context->ipc_namespace_path && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
5020 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
5021 if (r < 0) {
5022 *exit_status = EXIT_NAMESPACE;
5023 return log_error_errno(r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
5024 }
5025 }
5026
5027 r = setup_input(context, params, socket_fd, named_iofds);
5028 if (r < 0) {
5029 *exit_status = EXIT_STDIN;
5030 return log_error_errno(r, "Failed to set up standard input: %m");
5031 }
5032
5033 _cleanup_free_ char *fname = NULL;
5034 r = path_extract_filename(command->path, &fname);
5035 if (r < 0) {
5036 *exit_status = EXIT_STDOUT;
5037 return log_error_errno(r, "Failed to extract filename from path %s: %m", command->path);
5038 }
5039
5040 r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, fname, uid, gid, &journal_stream_dev, &journal_stream_ino);
5041 if (r < 0) {
5042 *exit_status = EXIT_STDOUT;
5043 return log_error_errno(r, "Failed to set up standard output: %m");
5044 }
5045
5046 r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, fname, uid, gid, &journal_stream_dev, &journal_stream_ino);
5047 if (r < 0) {
5048 *exit_status = EXIT_STDERR;
5049 return log_error_errno(r, "Failed to set up standard error output: %m");
5050 }
5051
5052 /* Now that stdin/stdout are definitely opened, properly initialize it with our desired
5053 * settings. Note: this is a "constructive" reset, it prepares things for us to use. This is
5054 * different from the "destructive" TTY reset further up. Also note: we apply this on stdin/stdout in
5055 * case this is a tty, regardless if we opened it ourselves or got it passed in pre-opened. */
5056 prepare_terminal(context, params);
5057
5058 if (context->oom_score_adjust_set) {
5059 /* When we can't make this change due to EPERM, then let's silently skip over it. User
5060 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
5061 r = set_oom_score_adjust(context->oom_score_adjust);
5062 if (ERRNO_IS_NEG_PRIVILEGE(r))
5063 log_debug_errno(r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
5064 else if (r < 0) {
5065 *exit_status = EXIT_OOM_ADJUST;
5066 return log_error_errno(r, "Failed to adjust OOM setting: %m");
5067 }
5068 }
5069
5070 if (context->coredump_filter_set) {
5071 r = set_coredump_filter(context->coredump_filter);
5072 if (ERRNO_IS_NEG_PRIVILEGE(r))
5073 log_debug_errno(r, "Failed to adjust coredump_filter, ignoring: %m");
5074 else if (r < 0) {
5075 *exit_status = EXIT_LIMITS;
5076 return log_error_errno(r, "Failed to adjust coredump_filter: %m");
5077 }
5078 }
5079
5080 if (context->cpu_sched_set) {
5081 struct sched_attr attr = {
5082 .size = sizeof(attr),
5083 .sched_policy = context->cpu_sched_policy,
5084 .sched_priority = context->cpu_sched_priority,
5085 .sched_flags = context->cpu_sched_reset_on_fork ? SCHED_FLAG_RESET_ON_FORK : 0,
5086 };
5087
5088 r = sched_setattr(/* pid= */ 0, &attr, /* flags= */ 0);
5089 if (r < 0) {
5090 *exit_status = EXIT_SETSCHEDULER;
5091 return log_error_errno(errno, "Failed to set up CPU scheduling: %m");
5092 }
5093 }
5094
5095 /*
5096 * Set nice value _after_ the call to sched_setattr() because struct sched_attr includes sched_nice
5097 * which we do not set, thus it will clobber any previously set nice value. Scheduling policy might
5098 * be reasonably set together with nice value e.g. in case of SCHED_BATCH (see sched(7)).
5099 * It would be ideal to set both with the same call, but we cannot easily do so because of all the
5100 * extra logic in setpriority_closest().
5101 */
5102 if (context->nice_set) {
5103 r = setpriority_closest(context->nice);
5104 if (r < 0) {
5105 *exit_status = EXIT_NICE;
5106 return log_error_errno(r, "Failed to set up process scheduling priority (nice level): %m");
5107 }
5108 }
5109
5110 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
5111 _cleanup_(cpu_set_done) CPUSet converted_cpu_set = {};
5112 const CPUSet *cpu_set;
5113
5114 if (context->cpu_affinity_from_numa) {
5115 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
5116 if (r < 0) {
5117 *exit_status = EXIT_CPUAFFINITY;
5118 return log_error_errno(r, "Failed to derive CPU affinity mask from NUMA mask: %m");
5119 }
5120
5121 cpu_set = &converted_cpu_set;
5122 } else
5123 cpu_set = &context->cpu_set;
5124
5125 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
5126 *exit_status = EXIT_CPUAFFINITY;
5127 return log_error_errno(errno, "Failed to set up CPU affinity: %m");
5128 }
5129 }
5130
5131 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
5132 r = apply_numa_policy(&context->numa_policy);
5133 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
5134 log_debug_errno(r, "NUMA support not available, ignoring.");
5135 else if (r < 0) {
5136 *exit_status = EXIT_NUMA_POLICY;
5137 return log_error_errno(r, "Failed to set NUMA memory policy: %m");
5138 }
5139 }
5140
5141 if (context->ioprio_set)
5142 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
5143 *exit_status = EXIT_IOPRIO;
5144 return log_error_errno(errno, "Failed to set up IO scheduling priority: %m");
5145 }
5146
5147 if (context->timer_slack_nsec != NSEC_INFINITY)
5148 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
5149 *exit_status = EXIT_TIMERSLACK;
5150 return log_error_errno(errno, "Failed to set up timer slack: %m");
5151 }
5152
5153 if (context->personality != PERSONALITY_INVALID) {
5154 r = safe_personality(context->personality);
5155 if (r < 0) {
5156 *exit_status = EXIT_PERSONALITY;
5157 return log_error_errno(r, "Failed to set up execution domain (personality): %m");
5158 }
5159 }
5160
5161 if (context->memory_ksm >= 0)
5162 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm, 0, 0, 0) < 0) {
5163 if (ERRNO_IS_NOT_SUPPORTED(errno))
5164 log_debug_errno(errno, "KSM support not available, ignoring.");
5165 else {
5166 *exit_status = EXIT_KSM;
5167 return log_error_errno(errno, "Failed to set KSM: %m");
5168 }
5169 }
5170
5171 #if ENABLE_UTMP
5172 if (context->utmp_id) {
5173 _cleanup_free_ char *username_alloc = NULL;
5174
5175 if (!username && context->utmp_mode == EXEC_UTMP_USER) {
5176 username_alloc = uid_to_name(uid_is_valid(uid) ? uid : saved_uid);
5177 if (!username_alloc) {
5178 *exit_status = EXIT_USER;
5179 return log_oom();
5180 }
5181 }
5182
5183 const char *line = context->tty_path ?
5184 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
5185 NULL;
5186 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
5187 line,
5188 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
5189 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
5190 USER_PROCESS,
5191 username ?: username_alloc);
5192 }
5193 #endif
5194
5195 if (uid_is_valid(uid)) {
5196 r = chown_terminal(STDIN_FILENO, uid);
5197 if (r < 0) {
5198 *exit_status = EXIT_STDIN;
5199 return log_error_errno(r, "Failed to change ownership of terminal: %m");
5200 }
5201 }
5202
5203 if (params->cgroup_path) {
5204 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
5205 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
5206 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
5207 * touch a single hierarchy too. */
5208
5209 if (params->flags & EXEC_CGROUP_DELEGATE) {
5210 _cleanup_free_ char *p = NULL;
5211
5212 r = cg_set_access(params->cgroup_path, uid, gid);
5213 if (r < 0) {
5214 *exit_status = EXIT_CGROUP;
5215 return log_error_errno(r, "Failed to adjust control group access: %m");
5216 }
5217
5218 r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &p);
5219 if (r < 0) {
5220 *exit_status = EXIT_CGROUP;
5221 return log_error_errno(r, "Failed to acquire cgroup path: %m");
5222 }
5223 if (r > 0) {
5224 r = cg_set_access_recursive(p, uid, gid);
5225 if (r < 0) {
5226 *exit_status = EXIT_CGROUP;
5227 return log_error_errno(r, "Failed to adjust control subgroup access: %m");
5228 }
5229 }
5230 }
5231
5232 if (is_pressure_supported() > 0) {
5233 if (cgroup_context_want_memory_pressure(cgroup_context)) {
5234 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
5235 if (r < 0) {
5236 *exit_status = EXIT_MEMORY;
5237 return log_oom();
5238 }
5239
5240 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
5241 if (r < 0) {
5242 log_full_errno(r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
5243 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
5244 memory_pressure_path = mfree(memory_pressure_path);
5245 }
5246 /* First we use the current cgroup path to chmod and chown the memory pressure path, then pass the path relative
5247 * to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
5248 * pressure path environment variable or read-write mount to the unit. This is why we check if
5249 * memory_pressure_path != NULL in the conditional below. */
5250 if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context)) {
5251 memory_pressure_path = mfree(memory_pressure_path);
5252 r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path);
5253 if (r < 0) {
5254 *exit_status = EXIT_MEMORY;
5255 return log_oom();
5256 }
5257 }
5258 } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_NO) {
5259 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
5260 if (!memory_pressure_path) {
5261 *exit_status = EXIT_MEMORY;
5262 return log_oom();
5263 }
5264 }
5265 }
5266 }
5267
5268 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
5269
5270 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5271 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
5272 if (r < 0)
5273 return log_error_errno(r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
5274 }
5275
5276 r = exec_setup_credentials(context, cgroup_context, params, params->unit_id, uid, gid);
5277 if (r < 0) {
5278 *exit_status = EXIT_CREDENTIALS;
5279 return log_error_errno(r, "Failed to set up credentials: %m");
5280 }
5281
5282 r = build_environment(
5283 context,
5284 params,
5285 cgroup_context,
5286 n_fds,
5287 pwent_home,
5288 username,
5289 shell,
5290 journal_stream_dev,
5291 journal_stream_ino,
5292 memory_pressure_path,
5293 needs_sandboxing,
5294 &our_env);
5295 if (r < 0) {
5296 *exit_status = EXIT_MEMORY;
5297 return log_oom();
5298 }
5299
5300 r = build_pass_environment(context, &pass_env);
5301 if (r < 0) {
5302 *exit_status = EXIT_MEMORY;
5303 return log_oom();
5304 }
5305
5306 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
5307 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
5308 * not specify PATH but the unit has ExecSearchPath. */
5309 if (!strv_isempty(context->exec_search_path)) {
5310 _cleanup_free_ char *joined = NULL;
5311
5312 joined = strv_join(context->exec_search_path, ":");
5313 if (!joined) {
5314 *exit_status = EXIT_MEMORY;
5315 return log_oom();
5316 }
5317
5318 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
5319 if (r < 0) {
5320 *exit_status = EXIT_MEMORY;
5321 return log_oom();
5322 }
5323 }
5324
5325 accum_env = strv_env_merge(params->environment,
5326 our_env,
5327 joined_exec_search_path,
5328 pass_env,
5329 context->environment,
5330 params->files_env);
5331 if (!accum_env) {
5332 *exit_status = EXIT_MEMORY;
5333 return log_oom();
5334 }
5335 strv_env_clean(accum_env);
5336
5337 (void) umask(context->umask);
5338
5339 r = setup_term_environment(context, &accum_env);
5340 if (r < 0) {
5341 *exit_status = EXIT_MEMORY;
5342 return log_error_errno(r, "Failed to construct $TERM: %m");
5343 }
5344
5345 r = setup_keyring(context, params, uid, gid);
5346 if (r < 0) {
5347 *exit_status = EXIT_KEYRING;
5348 return log_error_errno(r, "Failed to set up kernel keyring: %m");
5349 }
5350
5351 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
5352 * excepted from either whole sandboxing or just setresuid() itself. */
5353 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
5354
5355 uint64_t capability_ambient_set = context->capability_ambient_set;
5356
5357 /* Check CAP_SYS_ADMIN before we enter user namespace to see if we can mount /proc even though it's masked. */
5358 have_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0;
5359
5360 if (needs_sandboxing) {
5361 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
5362 * /sys being present. The actual MAC context application will happen later, as late as
5363 * possible, to avoid impacting our own code paths. */
5364
5365 #if HAVE_SELINUX
5366 use_selinux = mac_selinux_use();
5367 #endif
5368 #if ENABLE_SMACK
5369 use_smack = mac_smack_use();
5370 #endif
5371 #if HAVE_APPARMOR
5372 if (mac_apparmor_use()) {
5373 r = dlopen_libapparmor();
5374 if (r < 0 && !ERRNO_IS_NEG_NOT_SUPPORTED(r))
5375 log_warning_errno(r, "Failed to load libapparmor, ignoring: %m");
5376 use_apparmor = r >= 0;
5377 }
5378 #endif
5379 }
5380
5381 if (needs_sandboxing) {
5382 int which_failed;
5383
5384 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
5385 * is set here. (See below.) */
5386
5387 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
5388 if (r < 0) {
5389 *exit_status = EXIT_LIMITS;
5390 return log_error_errno(r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
5391 }
5392 }
5393
5394 if (needs_setuid && context->pam_name && username) {
5395 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
5396 * wins here. (See above.) */
5397
5398 /* All fds passed in the fds array will be closed in the pam child process. */
5399 r = setup_pam(context, cgroup_context, params, username, uid, gid, &accum_env,
5400 params->fds, n_fds, needs_sandboxing, params->exec_fd);
5401 if (r < 0) {
5402 *exit_status = EXIT_PAM;
5403 return log_error_errno(r, "Failed to set up PAM session: %m");
5404 }
5405
5406 /* PAM modules might have set some ambient caps. Query them here and merge them into
5407 * the caps we want to set in the end, so that we don't end up unsetting them. */
5408 uint64_t ambient_after_pam;
5409 r = capability_get_ambient(&ambient_after_pam);
5410 if (r < 0) {
5411 *exit_status = EXIT_CAPABILITIES;
5412 return log_error_errno(r, "Failed to query ambient caps: %m");
5413 }
5414
5415 capability_ambient_set |= ambient_after_pam;
5416
5417 ngids_after_pam = getgroups_alloc(&gids_after_pam);
5418 if (ngids_after_pam < 0) {
5419 *exit_status = EXIT_GROUP;
5420 return log_error_errno(ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5421 }
5422 }
5423
5424 if (needs_sandboxing && !have_cap_sys_admin && exec_needs_cap_sys_admin(context, params)) {
5425 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
5426 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
5427 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
5428 PrivateUsers pu = exec_context_get_effective_private_users(context, params);
5429 if (pu == PRIVATE_USERS_NO)
5430 pu = PRIVATE_USERS_SELF;
5431
5432 /* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in
5433 * unprivileged user namespaces. */
5434 r = setup_private_users(pu, saved_uid, saved_gid, uid, gid, /* allow_setgroups= */ false);
5435 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
5436 * the actual requested operations fail (or silently continue). */
5437 if (r < 0 && context->private_users != PRIVATE_USERS_NO) {
5438 *exit_status = EXIT_USER;
5439 return log_error_errno(r, "Failed to set up user namespacing for unprivileged user: %m");
5440 }
5441 if (r < 0)
5442 log_info_errno(r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
5443 else {
5444 assert(r > 0);
5445 userns_set_up = true;
5446 log_debug("Set up unprivileged user namespace");
5447 }
5448 }
5449
5450 /* Call setup_delegated_namespaces() the first time to unshare all non-delegated namespaces. */
5451 r = setup_delegated_namespaces(
5452 context,
5453 params,
5454 runtime,
5455 /* delegate= */ false,
5456 memory_pressure_path,
5457 uid,
5458 gid,
5459 command,
5460 needs_sandboxing,
5461 have_cap_sys_admin,
5462 exit_status);
5463 if (r < 0)
5464 return r;
5465
5466 /* Drop groups as early as possible.
5467 * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root.
5468 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
5469 if (needs_setuid) {
5470 _cleanup_free_ gid_t *gids_to_enforce = NULL;
5471 int ngids_to_enforce;
5472
5473 ngids_to_enforce = merge_gid_lists(gids,
5474 ngids,
5475 gids_after_pam,
5476 ngids_after_pam,
5477 &gids_to_enforce);
5478 if (ngids_to_enforce < 0) {
5479 *exit_status = EXIT_GROUP;
5480 return log_error_errno(ngids_to_enforce, "Failed to merge group lists. Group membership might be incorrect: %m");
5481 }
5482
5483 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
5484 if (r < 0) {
5485 *exit_status = EXIT_GROUP;
5486 return log_error_errno(r, "Changing group credentials failed: %m");
5487 }
5488 }
5489
5490 /* If the user namespace was not set up above, try to do it now.
5491 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
5492 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5493 * case of mount namespaces being less privileged when the mount point list is copied from a
5494 * different user namespace). */
5495
5496 if (needs_sandboxing && !userns_set_up) {
5497 PrivateUsers pu = exec_context_get_effective_private_users(context, params);
5498
5499 r = setup_private_users(pu, saved_uid, saved_gid, uid, gid,
5500 /* allow_setgroups= */ pu == PRIVATE_USERS_FULL);
5501 if (r < 0) {
5502 *exit_status = EXIT_USER;
5503 return log_error_errno(r, "Failed to set up user namespacing: %m");
5504 }
5505 if (r > 0)
5506 log_debug("Set up privileged user namespace");
5507 }
5508
5509 /* Call setup_delegated_namespaces() the second time to unshare all delegated namespaces. */
5510 r = setup_delegated_namespaces(
5511 context,
5512 params,
5513 runtime,
5514 /* delegate= */ true,
5515 memory_pressure_path,
5516 uid,
5517 gid,
5518 command,
5519 needs_sandboxing,
5520 have_cap_sys_admin,
5521 exit_status);
5522 if (r < 0)
5523 return r;
5524
5525 if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) {
5526 /* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which
5527 * ensures the root of the cgroup namespace is the top level service cgroup and not the
5528 * subcgroup. Adjust the prefix accordingly since we're in a cgroup namespace now. */
5529 r = attach_to_subcgroup(context, cgroup_context, params, /* prefix= */ NULL);
5530 if (r < 0) {
5531 *exit_status = EXIT_CGROUP;
5532 return r;
5533 }
5534 }
5535
5536 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
5537 * shall execute. */
5538
5539 const char *path = command->path;
5540
5541 if (FLAGS_SET(command->flags, EXEC_COMMAND_VIA_SHELL)) {
5542 if (shell_is_placeholder(shell)) {
5543 log_debug("Shell prefixing requested for user without default shell, using /bin/sh: %s",
5544 strna(username));
5545 assert(streq(path, _PATH_BSHELL));
5546 } else
5547 path = shell;
5548 }
5549
5550 _cleanup_free_ char *executable = NULL;
5551 _cleanup_close_ int executable_fd = -EBADF;
5552 r = find_executable_full(path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
5553 if (r < 0) {
5554 *exit_status = EXIT_EXEC;
5555 log_struct_errno(LOG_NOTICE, r,
5556 LOG_MESSAGE_ID(SD_MESSAGE_SPAWN_FAILED_STR),
5557 LOG_EXEC_MESSAGE(params, "Unable to locate executable '%s': %m", path),
5558 LOG_ITEM("EXECUTABLE=%s", path));
5559 /* If the error will be ignored by manager, tune down the log level here. Missing executable
5560 * is very much expected in this case. */
5561 return r != -ENOMEM && FLAGS_SET(command->flags, EXEC_COMMAND_IGNORE_FAILURE) ? 1 : r;
5562 }
5563
5564 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &executable_fd);
5565 if (r < 0) {
5566 *exit_status = EXIT_FDS;
5567 return log_error_errno(r, "Failed to collect shifted fd: %m");
5568 }
5569
5570 #if HAVE_SELINUX
5571 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
5572 int fd = -EBADF;
5573
5574 if (socket_fd >= 0)
5575 fd = socket_fd;
5576 else if (params->n_socket_fds == 1)
5577 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
5578 * use context from that fd to compute the label. */
5579 fd = params->fds[0];
5580
5581 if (fd >= 0) {
5582 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
5583 if (r < 0) {
5584 if (!context->selinux_context_ignore) {
5585 *exit_status = EXIT_SELINUX_CONTEXT;
5586 return log_error_errno(r, "Failed to determine SELinux context: %m");
5587 }
5588 log_debug_errno(r, "Failed to determine SELinux context, ignoring: %m");
5589 }
5590 }
5591 }
5592 #endif
5593
5594 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
5595 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
5596 * more. We do keep exec_fd and handoff_timestamp_fd however, if we have it, since we need to keep
5597 * them open until the final execve(). But first, close the remaining sockets in the context
5598 * objects. */
5599
5600 exec_runtime_close(runtime);
5601 exec_params_close(params);
5602
5603 r = close_all_fds(keep_fds, n_keep_fds);
5604 if (r >= 0)
5605 r = pack_fds(params->fds, n_fds);
5606 if (r >= 0)
5607 r = flag_fds(params->fds, n_socket_fds, n_fds, context->non_blocking);
5608 if (r < 0) {
5609 *exit_status = EXIT_FDS;
5610 return log_error_errno(r, "Failed to adjust passed file descriptors: %m");
5611 }
5612
5613 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
5614 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
5615 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
5616 * came this far. */
5617
5618 secure_bits = context->secure_bits;
5619
5620 if (needs_sandboxing) {
5621 uint64_t bset;
5622
5623 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
5624 * (Note this is placed after the general resource limit initialization, see above, in order
5625 * to take precedence.) */
5626 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
5627 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
5628 *exit_status = EXIT_LIMITS;
5629 return log_error_errno(errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
5630 }
5631 }
5632
5633 #if ENABLE_SMACK
5634 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5635 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5636 if (use_smack) {
5637 r = setup_smack(context, params, executable_fd);
5638 if (r < 0 && !context->smack_process_label_ignore) {
5639 *exit_status = EXIT_SMACK_PROCESS_LABEL;
5640 return log_error_errno(r, "Failed to set SMACK process label: %m");
5641 }
5642 }
5643 #endif
5644
5645 bset = context->capability_bounding_set;
5646
5647 #if HAVE_SECCOMP
5648 /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
5649 * keep the needed privileges to apply it even if we're not root. */
5650 if (needs_setuid &&
5651 uid_is_valid(uid) &&
5652 context_has_seccomp(context) &&
5653 seccomp_allows_drop_privileges(context)) {
5654 keep_seccomp_privileges = true;
5655
5656 if (prctl(PR_SET_KEEPCAPS, 1) < 0) {
5657 *exit_status = EXIT_USER;
5658 return log_error_errno(errno, "Failed to enable keep capabilities flag: %m");
5659 }
5660
5661 /* Save the current bounding set so we can restore it after applying the seccomp
5662 * filter */
5663 saved_bset = bset;
5664 bset |= (UINT64_C(1) << CAP_SYS_ADMIN) |
5665 (UINT64_C(1) << CAP_SETPCAP);
5666 }
5667 #endif
5668
5669 if (!cap_test_all(bset)) {
5670 r = capability_bounding_set_drop(bset, /* right_now= */ false);
5671 if (r < 0) {
5672 *exit_status = EXIT_CAPABILITIES;
5673 return log_error_errno(r, "Failed to drop capabilities: %m");
5674 }
5675 }
5676
5677 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5678 * keep-caps set.
5679 *
5680 * To be able to raise the ambient capabilities after setresuid() they have to be added to
5681 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
5682 * the ambient capabilities can be raised as they are present in the permitted and
5683 * inheritable set. However it is possible that someone wants to set ambient capabilities
5684 * without changing the user, so we also set the ambient capabilities here.
5685 *
5686 * The requested ambient capabilities are raised in the inheritable set if the second
5687 * argument is true. */
5688 if (capability_ambient_set != 0) {
5689 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
5690 if (r < 0) {
5691 *exit_status = EXIT_CAPABILITIES;
5692 return log_error_errno(r, "Failed to apply ambient capabilities (before UID change): %m");
5693 }
5694 }
5695 }
5696
5697 /* chroot to root directory first, before we lose the ability to chroot */
5698 r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
5699 if (r < 0)
5700 return log_error_errno(r, "Chrooting to the requested root directory failed: %m");
5701
5702 if (needs_setuid) {
5703 if (uid_is_valid(uid)) {
5704 r = enforce_user(context, uid, capability_ambient_set);
5705 if (r < 0) {
5706 *exit_status = EXIT_USER;
5707 return log_error_errno(r, "Failed to change UID to " UID_FMT ": %m", uid);
5708 }
5709
5710 if (keep_seccomp_privileges) {
5711 if (!BIT_SET(capability_ambient_set, CAP_SETUID)) {
5712 r = drop_capability(CAP_SETUID);
5713 if (r < 0) {
5714 *exit_status = EXIT_USER;
5715 return log_error_errno(r, "Failed to drop CAP_SETUID: %m");
5716 }
5717 }
5718
5719 r = keep_capability(CAP_SYS_ADMIN);
5720 if (r < 0) {
5721 *exit_status = EXIT_USER;
5722 return log_error_errno(r, "Failed to keep CAP_SYS_ADMIN: %m");
5723 }
5724
5725 r = keep_capability(CAP_SETPCAP);
5726 if (r < 0) {
5727 *exit_status = EXIT_USER;
5728 return log_error_errno(r, "Failed to keep CAP_SETPCAP: %m");
5729 }
5730 }
5731
5732 if (capability_ambient_set != 0) {
5733
5734 /* Raise the ambient capabilities after user change. */
5735 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
5736 if (r < 0) {
5737 *exit_status = EXIT_CAPABILITIES;
5738 return log_error_errno(r, "Failed to apply ambient capabilities (after UID change): %m");
5739 }
5740 }
5741 }
5742 }
5743
5744 /* Apply working directory here, because the working directory might be on NFS and only the user
5745 * running this service might have the correct privilege to change to the working directory. Also, it
5746 * is absolutely 💣 crucial 💣 we applied all mount namespacing rearrangements before this, so that
5747 * the cwd cannot be used to pin directories outside of the sandbox. */
5748 r = apply_working_directory(context, params, runtime, pwent_home, accum_env);
5749 if (r < 0) {
5750 *exit_status = EXIT_CHDIR;
5751 return log_error_errno(r, "Changing to the requested working directory failed: %m");
5752 }
5753
5754 if (needs_sandboxing) {
5755 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5756 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5757 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5758 * are restricted. */
5759
5760 #if HAVE_SELINUX
5761 if (use_selinux) {
5762 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5763
5764 if (exec_context) {
5765 r = setexeccon(exec_context);
5766 if (r < 0) {
5767 if (!context->selinux_context_ignore) {
5768 *exit_status = EXIT_SELINUX_CONTEXT;
5769 return log_error_errno(r, "Failed to change SELinux context to %s: %m", exec_context);
5770 }
5771 log_debug_errno(r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5772 }
5773 }
5774 }
5775 #endif
5776
5777 #if HAVE_APPARMOR
5778 if (use_apparmor && context->apparmor_profile) {
5779 r = ASSERT_PTR(sym_aa_change_onexec)(context->apparmor_profile);
5780 if (r < 0 && !context->apparmor_profile_ignore) {
5781 *exit_status = EXIT_APPARMOR_PROFILE;
5782 return log_error_errno(errno, "Failed to prepare AppArmor profile change to %s: %m",
5783 context->apparmor_profile);
5784 }
5785 }
5786 #endif
5787
5788 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5789 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5790 * requires CAP_SETPCAP. */
5791 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5792 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5793 * effective set here.
5794 *
5795 * The effective set is overwritten during execve() with the following values:
5796 *
5797 * - ambient set (for non-root processes)
5798 *
5799 * - (inheritable | bounding) set (for root processes)
5800 *
5801 * Hence there is no security impact in raising it in the effective set before execve().
5802 */
5803 r = capability_gain_cap_setpcap(/* ret_before_caps = */ NULL);
5804 if (r < 0) {
5805 *exit_status = EXIT_CAPABILITIES;
5806 return log_error_errno(r, "Failed to gain CAP_SETPCAP for setting secure bits");
5807 }
5808 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5809 *exit_status = EXIT_SECUREBITS;
5810 return log_error_errno(errno, "Failed to set process secure bits: %m");
5811 }
5812 }
5813
5814 if (context_has_no_new_privileges(context))
5815 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5816 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5817 return log_error_errno(errno, "Failed to disable new privileges: %m");
5818 }
5819
5820 #if HAVE_SECCOMP
5821 r = apply_address_families(context, params);
5822 if (r < 0) {
5823 *exit_status = EXIT_ADDRESS_FAMILIES;
5824 return log_error_errno(r, "Failed to restrict address families: %m");
5825 }
5826
5827 r = apply_memory_deny_write_execute(context, params);
5828 if (r < 0) {
5829 *exit_status = EXIT_SECCOMP;
5830 return log_error_errno(r, "Failed to disable writing to executable memory: %m");
5831 }
5832
5833 r = apply_restrict_realtime(context, params);
5834 if (r < 0) {
5835 *exit_status = EXIT_SECCOMP;
5836 return log_error_errno(r, "Failed to apply realtime restrictions: %m");
5837 }
5838
5839 r = apply_restrict_suid_sgid(context, params);
5840 if (r < 0) {
5841 *exit_status = EXIT_SECCOMP;
5842 return log_error_errno(r, "Failed to apply SUID/SGID restrictions: %m");
5843 }
5844
5845 r = apply_restrict_namespaces(context, params);
5846 if (r < 0) {
5847 *exit_status = EXIT_SECCOMP;
5848 return log_error_errno(r, "Failed to apply namespace restrictions: %m");
5849 }
5850
5851 r = apply_protect_sysctl(context, params);
5852 if (r < 0) {
5853 *exit_status = EXIT_SECCOMP;
5854 return log_error_errno(r, "Failed to apply sysctl restrictions: %m");
5855 }
5856
5857 r = apply_protect_kernel_modules(context, params);
5858 if (r < 0) {
5859 *exit_status = EXIT_SECCOMP;
5860 return log_error_errno(r, "Failed to apply module loading restrictions: %m");
5861 }
5862
5863 r = apply_protect_kernel_logs(context, params);
5864 if (r < 0) {
5865 *exit_status = EXIT_SECCOMP;
5866 return log_error_errno(r, "Failed to apply kernel log restrictions: %m");
5867 }
5868
5869 r = apply_protect_clock(context, params);
5870 if (r < 0) {
5871 *exit_status = EXIT_SECCOMP;
5872 return log_error_errno(r, "Failed to apply clock restrictions: %m");
5873 }
5874
5875 r = apply_private_devices(context, params);
5876 if (r < 0) {
5877 *exit_status = EXIT_SECCOMP;
5878 return log_error_errno(r, "Failed to set up private devices: %m");
5879 }
5880
5881 r = apply_syscall_archs(context, params);
5882 if (r < 0) {
5883 *exit_status = EXIT_SECCOMP;
5884 return log_error_errno(r, "Failed to apply syscall architecture restrictions: %m");
5885 }
5886
5887 r = apply_lock_personality(context, params);
5888 if (r < 0) {
5889 *exit_status = EXIT_SECCOMP;
5890 return log_error_errno(r, "Failed to lock personalities: %m");
5891 }
5892
5893 r = apply_syscall_log(context, params);
5894 if (r < 0) {
5895 *exit_status = EXIT_SECCOMP;
5896 return log_error_errno(r, "Failed to apply system call log filters: %m");
5897 }
5898 #endif
5899
5900 #if HAVE_LIBBPF
5901 r = apply_restrict_filesystems(context, params);
5902 if (r < 0) {
5903 *exit_status = EXIT_BPF;
5904 return log_error_errno(r, "Failed to restrict filesystems: %m");
5905 }
5906 #endif
5907
5908 #if HAVE_SECCOMP
5909 /* This really should remain as close to the execve() as possible, to make sure our own code is affected
5910 * by the filter as little as possible. */
5911 r = apply_syscall_filter(context, params);
5912 if (r < 0) {
5913 *exit_status = EXIT_SECCOMP;
5914 return log_error_errno(r, "Failed to apply system call filters: %m");
5915 }
5916
5917 if (keep_seccomp_privileges) {
5918 /* Restore the capability bounding set with what's expected from the service + the
5919 * ambient capabilities hack */
5920 if (!cap_test_all(saved_bset)) {
5921 r = capability_bounding_set_drop(saved_bset, /* right_now= */ false);
5922 if (r < 0) {
5923 *exit_status = EXIT_CAPABILITIES;
5924 return log_error_errno(r, "Failed to drop bset capabilities: %m");
5925 }
5926 }
5927
5928 /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break
5929 * applications that use it. */
5930 if (!BIT_SET(saved_bset, CAP_SYS_ADMIN)) {
5931 r = drop_capability(CAP_SYS_ADMIN);
5932 if (r < 0) {
5933 *exit_status = EXIT_USER;
5934 return log_error_errno(r, "Failed to drop CAP_SYS_ADMIN: %m");
5935 }
5936 }
5937
5938 /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break
5939 * applications that use it. */
5940 if (!BIT_SET(saved_bset, CAP_SETPCAP)) {
5941 r = drop_capability(CAP_SETPCAP);
5942 if (r < 0) {
5943 *exit_status = EXIT_USER;
5944 return log_error_errno(r, "Failed to drop CAP_SETPCAP: %m");
5945 }
5946 }
5947
5948 if (prctl(PR_SET_KEEPCAPS, 0) < 0) {
5949 *exit_status = EXIT_USER;
5950 return log_error_errno(errno, "Failed to drop keep capabilities flag: %m");
5951 }
5952 }
5953 #endif
5954
5955 }
5956
5957 if (!strv_isempty(context->unset_environment)) {
5958 char **ee = NULL;
5959
5960 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5961 if (!ee) {
5962 *exit_status = EXIT_MEMORY;
5963 return log_oom();
5964 }
5965
5966 strv_free_and_replace(accum_env, ee);
5967 }
5968
5969 _cleanup_strv_free_ char **replaced_argv = NULL, **argv_via_shell = NULL;
5970 char **final_argv = FLAGS_SET(command->flags, EXEC_COMMAND_VIA_SHELL) ? strv_skip(command->argv, 1) : command->argv;
5971
5972 if (final_argv && !FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5973 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5974
5975 r = replace_env_argv(final_argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5976 if (r < 0) {
5977 *exit_status = EXIT_MEMORY;
5978 return log_error_errno(r, "Failed to replace environment variables: %m");
5979 }
5980 final_argv = replaced_argv;
5981
5982 if (!strv_isempty(unset_variables)) {
5983 _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5984 log_warning("Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5985 }
5986
5987 if (!strv_isempty(bad_variables)) {
5988 _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5989 log_warning("Invalid environment variable name evaluates to an empty string: %s", strna(jb));
5990 }
5991 }
5992
5993 if (FLAGS_SET(command->flags, EXEC_COMMAND_VIA_SHELL)) {
5994 r = strv_extendf(&argv_via_shell, "%s%s", command->argv[0][0] == '-' ? "-" : "", path);
5995 if (r < 0) {
5996 *exit_status = EXIT_MEMORY;
5997 return log_oom();
5998 }
5999
6000 if (!strv_isempty(final_argv)) {
6001 _cleanup_free_ char *cmdline_joined = NULL;
6002
6003 cmdline_joined = strv_join(final_argv, " ");
6004 if (!cmdline_joined) {
6005 *exit_status = EXIT_MEMORY;
6006 return log_oom();
6007 }
6008
6009 r = strv_extend_many(&argv_via_shell, "-c", cmdline_joined);
6010 if (r < 0) {
6011 *exit_status = EXIT_MEMORY;
6012 return log_oom();
6013 }
6014 }
6015
6016 final_argv = argv_via_shell;
6017 }
6018
6019 log_command_line(context, params, "Executing", executable, final_argv);
6020
6021 /* We have finished with all our initializations. Let's now let the manager know that. From this
6022 * point on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
6023
6024 r = exec_fd_mark_hot(context, params, /* hot= */ true, exit_status);
6025 if (r < 0)
6026 return r;
6027
6028 /* As last thing before the execve(), let's send the handoff timestamp */
6029 r = send_handoff_timestamp(context, params, exit_status);
6030 if (r < 0) {
6031 /* If this handoff timestamp failed, let's undo the marking as hot */
6032 (void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL);
6033 return r;
6034 }
6035
6036 /* NB: we leave executable_fd, exec_fd, handoff_timestamp_fd open here. This is safe, because they
6037 * have O_CLOEXEC set, and the execve() below will thus automatically close them. In fact, for
6038 * exec_fd this is pretty much the whole raison d'etre. */
6039
6040 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
6041
6042 /* The execve() failed, let's undo the marking as hot */
6043 (void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL);
6044
6045 *exit_status = EXIT_EXEC;
6046 return log_error_errno(r, "Failed to execute %s: %m", executable);
6047 }